1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-NODL %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s 8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s 9 10define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, 11; GFX7-LABEL: udot4_acc32: 12; GFX7: ; %bb.0: ; %entry 13; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 14; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 15; GFX7-NEXT: s_mov_b32 s3, 0xf000 16; GFX7-NEXT: s_mov_b32 s6, 0 17; GFX7-NEXT: s_mov_b32 s7, s3 18; GFX7-NEXT: s_waitcnt lgkmcnt(0) 19; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 20; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 21; GFX7-NEXT: v_mov_b32_e32 v1, 0 22; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 23; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 24; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 25; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 26; GFX7-NEXT: s_mov_b32 s2, -1 27; GFX7-NEXT: s_waitcnt vmcnt(1) 28; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 29; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 30; GFX7-NEXT: s_waitcnt vmcnt(0) 31; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 32; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 33; GFX7-NEXT: s_waitcnt lgkmcnt(0) 34; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4 35; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 36; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 37; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 38; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 39; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 40; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 41; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, 
v1 42; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 43; GFX7-NEXT: s_endpgm 44; 45; GFX8-LABEL: udot4_acc32: 46; GFX8: ; %bb.0: ; %entry 47; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 48; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 49; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 50; GFX8-NEXT: s_waitcnt lgkmcnt(0) 51; GFX8-NEXT: v_mov_b32_e32 v1, s1 52; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 53; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 54; GFX8-NEXT: flat_load_dword v3, v[0:1] 55; GFX8-NEXT: v_mov_b32_e32 v1, s3 56; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 57; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 58; GFX8-NEXT: flat_load_dword v0, v[0:1] 59; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 60; GFX8-NEXT: s_waitcnt vmcnt(1) 61; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 62; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 63; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 64; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 65; GFX8-NEXT: s_waitcnt vmcnt(0) 66; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 67; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 70; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 71; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 72; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 73; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 74; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 75; GFX8-NEXT: v_mov_b32_e32 v0, s4 76; GFX8-NEXT: v_mov_b32_e32 v1, s5 77; GFX8-NEXT: flat_store_dword v[0:1], v2 78; GFX8-NEXT: s_endpgm 79; 80; GFX9-NODL-LABEL: udot4_acc32: 81; GFX9-NODL: ; %bb.0: ; %entry 82; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 83; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 84; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 85; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 86; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 87; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 88; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 89; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 90; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 91; GFX9-NODL-NEXT: 
v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 92; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 93; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 94; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 95; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 96; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 97; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 98; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 99; GFX9-NODL-NEXT: s_endpgm 100; 101; GFX9-DL-LABEL: udot4_acc32: 102; GFX9-DL: ; %bb.0: ; %entry 103; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 104; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 105; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 106; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 107; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 108; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 109; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 110; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 111; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 112; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 113; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 114; GFX9-DL-NEXT: s_endpgm 115; 116; GFX10-DL-LABEL: udot4_acc32: 117; GFX10-DL: ; %bb.0: ; %entry 118; GFX10-DL-NEXT: s_clause 0x1 119; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 120; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 121; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 122; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 123; GFX10-DL-NEXT: s_clause 0x1 124; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 125; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 126; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 127; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 128; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 129; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 130; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 131; 
GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 132; GFX10-DL-NEXT: s_endpgm 133; 134; GFX11-DL-LABEL: udot4_acc32: 135; GFX11-DL: ; %bb.0: ; %entry 136; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 137; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 138; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 139; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 140; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 141; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 142; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 143; GFX11-DL-NEXT: s_clause 0x1 144; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 145; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 146; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 147; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 148; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 149; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 150; GFX11-DL-NEXT: s_endpgm 151 ptr addrspace(1) %src2, 152 ptr addrspace(1) nocapture %dst) { 153entry: 154 %idx = call i32 @llvm.amdgcn.workitem.id.x() 155 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 156 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 157 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 158 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 159 160 %v1e0 = extractelement <4 x i8> %vec1, i64 0 161 %cv1e0 = zext i8 %v1e0 to i32 162 %v2e0 = extractelement <4 x i8> %vec2, i64 0 163 %cv2e0 = zext i8 %v2e0 to i32 164 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 165 166 %v1e1 = extractelement <4 x i8> %vec1, i64 1 167 %cv1e1 = zext i8 %v1e1 to i32 168 %v2e1 = extractelement <4 x i8> %vec2, i64 1 169 %cv2e1 = zext i8 %v2e1 to i32 170 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 171 172 %v1e2 = extractelement <4 x i8> %vec1, i64 2 173 %cv1e2 = zext i8 %v1e2 to i32 174 %v2e2 = extractelement <4 x i8> %vec2, i64 2 175 %cv2e2 = zext i8 %v2e2 to i32 176 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 177 178 %v1e3 = extractelement <4 x i8> %vec1, i64 3 179 %cv1e3 = zext i8 %v1e3 to i32 180 %v2e3 = extractelement <4 x i8> %vec2, i64 3 181 %cv2e3 = 
zext i8 %v2e3 to i32 182 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 183 184 %acc = load i32, ptr addrspace(1) %dst, align 4 185 %mad1 = add i32 %mul1, %acc 186 %mad2 = add i32 %mad1, %mul2 187 %mad3 = add i32 %mad2, %mul3 188 %mad4 = add i32 %mad3, %mul4 189 190 store i32 %mad4, ptr addrspace(1) %dst, align 4 191 ret void 192} 193 194define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, 195; GFX7-LABEL: udot4_acc16: 196; GFX7: ; %bb.0: ; %entry 197; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 198; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 199; GFX7-NEXT: s_mov_b32 s3, 0xf000 200; GFX7-NEXT: s_mov_b32 s6, 0 201; GFX7-NEXT: s_mov_b32 s7, s3 202; GFX7-NEXT: s_waitcnt lgkmcnt(0) 203; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 204; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 205; GFX7-NEXT: v_mov_b32_e32 v1, 0 206; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 207; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 208; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 209; GFX7-NEXT: s_mov_b32 s2, -1 210; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 211; GFX7-NEXT: s_waitcnt vmcnt(2) 212; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 213; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 214; GFX7-NEXT: s_waitcnt vmcnt(1) 215; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 216; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 217; GFX7-NEXT: s_waitcnt vmcnt(0) 218; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 219; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 220; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 221; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 222; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 223; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 224; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 225; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 226; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 227; GFX7-NEXT: s_endpgm 228; 229; GFX8-LABEL: udot4_acc16: 230; GFX8: ; %bb.0: ; %entry 231; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 232; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 233; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 234; GFX8-NEXT: 
v_mov_b32_e32 v5, 0xff 235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 236; GFX8-NEXT: v_mov_b32_e32 v1, s1 237; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 238; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 239; GFX8-NEXT: flat_load_dword v3, v[0:1] 240; GFX8-NEXT: v_mov_b32_e32 v1, s3 241; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 242; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 243; GFX8-NEXT: flat_load_dword v2, v[0:1] 244; GFX8-NEXT: v_mov_b32_e32 v0, s4 245; GFX8-NEXT: v_mov_b32_e32 v1, s5 246; GFX8-NEXT: flat_load_ushort v4, v[0:1] 247; GFX8-NEXT: s_waitcnt vmcnt(2) 248; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3 249; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 250; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 251; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 252; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 253; GFX8-NEXT: s_waitcnt vmcnt(1) 254; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2 255; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 256; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 257; GFX8-NEXT: s_waitcnt vmcnt(0) 258; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 259; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 260; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 261; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 262; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 263; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 264; GFX8-NEXT: flat_store_short v[0:1], v2 265; GFX8-NEXT: s_endpgm 266; 267; GFX9-NODL-LABEL: udot4_acc16: 268; GFX9-NODL: ; %bb.0: ; %entry 269; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 270; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 271; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 272; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 273; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 274; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 275; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 276; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] 277; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff 278; GFX9-NODL-NEXT: s_waitcnt 
vmcnt(2) 279; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 280; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 281; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2 282; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 283; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 284; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 285; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 286; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 287; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 288; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 289; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 290; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 291; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 292; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 293; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 294; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 295; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] 296; GFX9-NODL-NEXT: s_endpgm 297; 298; GFX9-DL-LABEL: udot4_acc16: 299; GFX9-DL: ; %bb.0: ; %entry 300; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 301; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 302; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 303; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 304; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 305; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 306; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] 307; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7] 308; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 309; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 310; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7] 311; GFX9-DL-NEXT: s_endpgm 312; 313; GFX10-DL-LABEL: udot4_acc16: 314; GFX10-DL: ; %bb.0: ; %entry 315; GFX10-DL-NEXT: s_clause 0x1 316; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 317; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 318; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 319; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 320; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 321; GFX10-DL-NEXT: 
s_clause 0x1 322; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 323; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 324; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] 325; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 326; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 327; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] 328; GFX10-DL-NEXT: s_endpgm 329; 330; GFX11-DL-LABEL: udot4_acc16: 331; GFX11-DL: ; %bb.0: ; %entry 332; GFX11-DL-NEXT: s_clause 0x1 333; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 334; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 335; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 336; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 337; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 338; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 339; GFX11-DL-NEXT: s_clause 0x1 340; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 341; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 342; GFX11-DL-NEXT: global_load_u16 v3, v1, s[4:5] 343; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 344; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 345; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5] 346; GFX11-DL-NEXT: s_endpgm 347 ptr addrspace(1) %src2, 348 ptr addrspace(1) nocapture %dst) { 349entry: 350 %idx = call i32 @llvm.amdgcn.workitem.id.x() 351 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 352 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 353 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 354 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 355 356 %v1e0 = extractelement <4 x i8> %vec1, i64 0 357 %cv1e0 = zext i8 %v1e0 to i16 358 %v2e0 = extractelement <4 x i8> %vec2, i64 0 359 %cv2e0 = zext i8 %v2e0 to i16 360 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0 361 362 %v1e1 = extractelement <4 x i8> %vec1, i64 1 363 %cv1e1 = zext i8 %v1e1 to i16 364 %v2e1 = extractelement <4 x i8> %vec2, i64 1 365 %cv2e1 = zext i8 %v2e1 to i16 366 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1 367 368 %v1e2 = extractelement <4 x i8> %vec1, i64 2 369 %cv1e2 = zext i8 %v1e2 to i16 
370 %v2e2 = extractelement <4 x i8> %vec2, i64 2 371 %cv2e2 = zext i8 %v2e2 to i16 372 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2 373 374 %v1e3 = extractelement <4 x i8> %vec1, i64 3 375 %cv1e3 = zext i8 %v1e3 to i16 376 %v2e3 = extractelement <4 x i8> %vec2, i64 3 377 %cv2e3 = zext i8 %v2e3 to i16 378 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3 379 380 %acc = load i16, ptr addrspace(1) %dst, align 2 381 %mad1 = add i16 %mul1, %acc 382 %mad2 = add i16 %mad1, %mul2 383 %mad3 = add i16 %mad2, %mul3 384 %mad4 = add i16 %mad3, %mul4 385 386 store i16 %mad4, ptr addrspace(1) %dst, align 2 387 ret void 388} 389 390define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, 391; GFX7-LABEL: udot4_acc8: 392; GFX7: ; %bb.0: ; %entry 393; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 394; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 395; GFX7-NEXT: s_mov_b32 s3, 0xf000 396; GFX7-NEXT: s_mov_b32 s6, 0 397; GFX7-NEXT: s_mov_b32 s7, s3 398; GFX7-NEXT: s_waitcnt lgkmcnt(0) 399; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 400; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 401; GFX7-NEXT: v_mov_b32_e32 v1, 0 402; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 403; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 404; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 405; GFX7-NEXT: s_mov_b32 s2, -1 406; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 407; GFX7-NEXT: s_waitcnt vmcnt(2) 408; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 409; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 410; GFX7-NEXT: s_waitcnt vmcnt(1) 411; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 412; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 413; GFX7-NEXT: s_waitcnt vmcnt(0) 414; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 415; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 416; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 417; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 418; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 419; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 420; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 421; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 422; GFX7-NEXT: buffer_store_byte 
v0, off, s[0:3], 0 423; GFX7-NEXT: s_endpgm 424; 425; GFX8-LABEL: udot4_acc8: 426; GFX8: ; %bb.0: ; %entry 427; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 428; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 429; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 430; GFX8-NEXT: s_waitcnt lgkmcnt(0) 431; GFX8-NEXT: v_mov_b32_e32 v1, s1 432; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 433; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 434; GFX8-NEXT: flat_load_dword v3, v[0:1] 435; GFX8-NEXT: v_mov_b32_e32 v1, s3 436; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 437; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 438; GFX8-NEXT: flat_load_dword v2, v[0:1] 439; GFX8-NEXT: v_mov_b32_e32 v0, s4 440; GFX8-NEXT: v_mov_b32_e32 v1, s5 441; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 442; GFX8-NEXT: s_waitcnt vmcnt(2) 443; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 444; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 445; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 446; GFX8-NEXT: s_waitcnt vmcnt(1) 447; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 448; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 449; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 450; GFX8-NEXT: s_waitcnt vmcnt(0) 451; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 452; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 453; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 454; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 455; GFX8-NEXT: flat_store_byte v[0:1], v2 456; GFX8-NEXT: s_endpgm 457; 458; GFX9-NODL-LABEL: udot4_acc8: 459; GFX9-NODL: ; %bb.0: ; %entry 460; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 461; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 462; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 463; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 464; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 465; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 466; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 467; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] 468; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 469; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 470; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 471; 
GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 472; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 473; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 474; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 475; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 476; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 477; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1 478; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 479; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 480; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 481; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] 482; GFX9-NODL-NEXT: s_endpgm 483; 484; GFX9-DL-LABEL: udot4_acc8: 485; GFX9-DL: ; %bb.0: ; %entry 486; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 487; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 488; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 489; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 490; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 491; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 492; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] 493; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 494; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 495; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 496; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] 497; GFX9-DL-NEXT: s_endpgm 498; 499; GFX10-DL-LABEL: udot4_acc8: 500; GFX10-DL: ; %bb.0: ; %entry 501; GFX10-DL-NEXT: s_clause 0x1 502; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 503; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 504; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 505; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 506; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 507; GFX10-DL-NEXT: s_clause 0x1 508; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 509; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 510; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 511; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 512; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 513; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 514; GFX10-DL-NEXT: s_endpgm 515; 516; GFX11-DL-LABEL: udot4_acc8: 517; GFX11-DL: ; %bb.0: ; %entry 518; GFX11-DL-NEXT: 
s_clause 0x1 519; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 520; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 521; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 522; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 523; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 524; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 525; GFX11-DL-NEXT: s_clause 0x1 526; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 527; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 528; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] 529; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 530; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 531; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] 532; GFX11-DL-NEXT: s_endpgm 533 ptr addrspace(1) %src2, 534 ptr addrspace(1) nocapture %dst) { 535entry: 536 %idx = call i32 @llvm.amdgcn.workitem.id.x() 537 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 538 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 539 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 540 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 541 542 %v1e0 = extractelement <4 x i8> %vec1, i64 0 543 %v2e0 = extractelement <4 x i8> %vec2, i64 0 544 %mul1 = mul nuw nsw i8 %v1e0, %v2e0 545 546 %v1e1 = extractelement <4 x i8> %vec1, i64 1 547 %v2e1 = extractelement <4 x i8> %vec2, i64 1 548 %mul2 = mul nuw nsw i8 %v1e1, %v2e1 549 550 %v1e2 = extractelement <4 x i8> %vec1, i64 2 551 %v2e2 = extractelement <4 x i8> %vec2, i64 2 552 %mul3 = mul nuw nsw i8 %v1e2, %v2e2 553 554 %v1e3 = extractelement <4 x i8> %vec1, i64 3 555 %v2e3 = extractelement <4 x i8> %vec2, i64 3 556 %mul4 = mul nuw nsw i8 %v1e3, %v2e3 557 558 %acc = load i8, ptr addrspace(1) %dst, align 2 559 %mad1 = add i8 %mul1, %acc 560 %mad2 = add i8 %mad1, %mul2 561 %mad3 = add i8 %mad2, %mul3 562 %mad4 = add i8 %mad3, %mul4 563 564 store i8 %mad4, ptr addrspace(1) %dst, align 2 565 ret void 566} 567 568define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, 569; GFX7-LABEL: udot2_8: 570; GFX7: ; %bb.0: ; %entry 571; 
GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 572; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 573; GFX7-NEXT: s_mov_b32 s7, 0xf000 574; GFX7-NEXT: s_mov_b32 s10, 0 575; GFX7-NEXT: s_mov_b32 s11, s7 576; GFX7-NEXT: s_waitcnt lgkmcnt(0) 577; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 578; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 579; GFX7-NEXT: v_mov_b32_e32 v1, 0 580; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 581; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 582; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 583; GFX7-NEXT: s_mov_b32 s6, -1 584; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 585; GFX7-NEXT: s_waitcnt vmcnt(2) 586; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 587; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 588; GFX7-NEXT: s_waitcnt vmcnt(1) 589; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 590; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 591; GFX7-NEXT: s_waitcnt vmcnt(0) 592; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 593; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 594; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 595; GFX7-NEXT: s_endpgm 596; 597; GFX8-LABEL: udot2_8: 598; GFX8: ; %bb.0: ; %entry 599; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 600; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 601; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 602; GFX8-NEXT: s_waitcnt lgkmcnt(0) 603; GFX8-NEXT: v_mov_b32_e32 v1, s1 604; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 605; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 606; GFX8-NEXT: flat_load_dword v3, v[0:1] 607; GFX8-NEXT: v_mov_b32_e32 v1, s3 608; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 609; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 610; GFX8-NEXT: flat_load_dword v2, v[0:1] 611; GFX8-NEXT: v_mov_b32_e32 v0, s4 612; GFX8-NEXT: v_mov_b32_e32 v1, s5 613; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 614; GFX8-NEXT: s_waitcnt vmcnt(2) 615; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 616; GFX8-NEXT: s_waitcnt vmcnt(1) 617; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 618; GFX8-NEXT: s_waitcnt vmcnt(0) 619; GFX8-NEXT: v_mad_u16 v2, v3, v2, 
v4 620; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 621; GFX8-NEXT: flat_store_byte v[0:1], v2 622; GFX8-NEXT: s_endpgm 623; 624; GFX9-NODL-LABEL: udot2_8: 625; GFX9-NODL: ; %bb.0: ; %entry 626; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 627; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 628; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 629; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 630; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 631; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1] 632; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3] 633; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[6:7] 634; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 635; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 636; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 637; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 638; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 639; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 640; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 641; GFX9-NODL-NEXT: global_store_byte v1, v0, s[6:7] 642; GFX9-NODL-NEXT: s_endpgm 643; 644; GFX9-DL-LABEL: udot2_8: 645; GFX9-DL: ; %bb.0: ; %entry 646; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 647; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 648; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 649; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 650; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 651; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 652; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 653; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 654; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 655; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 656; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 657; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 658; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 659; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 660; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 661; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] 662; GFX9-DL-NEXT: s_endpgm 663; 664; GFX10-DL-LABEL: udot2_8: 665; GFX10-DL: ; %bb.0: ; %entry 666; GFX10-DL-NEXT: s_clause 0x1 667; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 668; 
GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 669; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 670; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 671; GFX10-DL-NEXT: s_clause 0x1 672; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 673; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 674; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 675; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 676; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 677; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 678; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 679; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 680; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 681; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 682; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] 683; GFX10-DL-NEXT: s_endpgm 684; 685; GFX11-DL-LABEL: udot2_8: 686; GFX11-DL: ; %bb.0: ; %entry 687; GFX11-DL-NEXT: s_clause 0x1 688; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 689; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 690; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 691; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 692; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 693; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 694; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 695; GFX11-DL-NEXT: s_clause 0x1 696; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 697; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 698; GFX11-DL-NEXT: global_load_u8 v3, v2, s[4:5] 699; GFX11-DL-NEXT: s_waitcnt vmcnt(2) 700; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 701; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 702; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100 703; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 704; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 705; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 706; GFX11-DL-NEXT: global_store_b8 v2, v0, s[4:5] 707; GFX11-DL-NEXT: s_endpgm 708 ptr addrspace(1) %src2, 709 ptr addrspace(1) nocapture %dst) { 710entry: 711 %idx = call i32 @llvm.amdgcn.workitem.id.x() 712 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 713 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 714 
%gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 715 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 716 717 %v1e0 = extractelement <4 x i8> %vec1, i64 0 718 %v2e0 = extractelement <4 x i8> %vec2, i64 0 719 %mul1 = mul nuw nsw i8 %v1e0, %v2e0 720 721 %v1e1 = extractelement <4 x i8> %vec1, i64 1 722 %v2e1 = extractelement <4 x i8> %vec2, i64 1 723 %mul2 = mul nuw nsw i8 %v1e1, %v2e1 724 725 %acc = load i8, ptr addrspace(1) %dst, align 2 726 %mad1 = add i8 %mul1, %acc 727 %mad2 = add i8 %mad1, %mul2 728 store i8 %mad2, ptr addrspace(1) %dst, align 2 729 ret void 730} 731 732define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, 733; GFX7-LABEL: udot4_CommutationInsideMAD: 734; GFX7: ; %bb.0: ; %entry 735; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 736; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 737; GFX7-NEXT: s_mov_b32 s3, 0xf000 738; GFX7-NEXT: s_mov_b32 s6, 0 739; GFX7-NEXT: s_mov_b32 s7, s3 740; GFX7-NEXT: s_waitcnt lgkmcnt(0) 741; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 742; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 743; GFX7-NEXT: v_mov_b32_e32 v1, 0 744; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 745; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 746; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 747; GFX7-NEXT: s_mov_b32 s2, -1 748; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 749; GFX7-NEXT: s_waitcnt vmcnt(2) 750; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 751; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 752; GFX7-NEXT: s_waitcnt vmcnt(1) 753; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 754; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 755; GFX7-NEXT: s_waitcnt vmcnt(0) 756; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 757; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 758; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 759; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 760; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 761; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 762; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 763; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 764; GFX7-NEXT: 
buffer_store_byte v0, off, s[0:3], 0 765; GFX7-NEXT: s_endpgm 766; 767; GFX8-LABEL: udot4_CommutationInsideMAD: 768; GFX8: ; %bb.0: ; %entry 769; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 770; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 771; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 772; GFX8-NEXT: s_waitcnt lgkmcnt(0) 773; GFX8-NEXT: v_mov_b32_e32 v1, s1 774; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 775; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 776; GFX8-NEXT: flat_load_dword v3, v[0:1] 777; GFX8-NEXT: v_mov_b32_e32 v1, s3 778; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 779; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 780; GFX8-NEXT: flat_load_dword v2, v[0:1] 781; GFX8-NEXT: v_mov_b32_e32 v0, s4 782; GFX8-NEXT: v_mov_b32_e32 v1, s5 783; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 784; GFX8-NEXT: s_waitcnt vmcnt(2) 785; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 786; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 787; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 788; GFX8-NEXT: s_waitcnt vmcnt(1) 789; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 790; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 791; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 792; GFX8-NEXT: s_waitcnt vmcnt(0) 793; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 794; GFX8-NEXT: v_mad_u16 v2, v8, v7, v2 795; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 796; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 797; GFX8-NEXT: flat_store_byte v[0:1], v2 798; GFX8-NEXT: s_endpgm 799; 800; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: 801; GFX9-NODL: ; %bb.0: ; %entry 802; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 803; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 804; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 805; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 806; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 807; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 808; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 809; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] 810; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 811; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 812; 
GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 813; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 814; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 815; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 816; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 817; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 818; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 819; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v7, v6, v1 820; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 821; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 822; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 823; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] 824; GFX9-NODL-NEXT: s_endpgm 825; 826; GFX9-DL-LABEL: udot4_CommutationInsideMAD: 827; GFX9-DL: ; %bb.0: ; %entry 828; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 829; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 830; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 831; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 832; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 833; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 834; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] 835; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 836; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 837; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 838; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] 839; GFX9-DL-NEXT: s_endpgm 840; 841; GFX10-DL-LABEL: udot4_CommutationInsideMAD: 842; GFX10-DL: ; %bb.0: ; %entry 843; GFX10-DL-NEXT: s_clause 0x1 844; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 845; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 846; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 847; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 848; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 849; GFX10-DL-NEXT: s_clause 0x1 850; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 851; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 852; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 853; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 854; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 855; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 856; GFX10-DL-NEXT: s_endpgm 857; 858; 
GFX11-DL-LABEL: udot4_CommutationInsideMAD: 859; GFX11-DL: ; %bb.0: ; %entry 860; GFX11-DL-NEXT: s_clause 0x1 861; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 862; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 863; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 864; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 865; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 866; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 867; GFX11-DL-NEXT: s_clause 0x1 868; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 869; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 870; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] 871; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 872; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 873; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] 874; GFX11-DL-NEXT: s_endpgm 875 ptr addrspace(1) %src2, 876 ptr addrspace(1) nocapture %dst) { 877entry: 878 %idx = call i32 @llvm.amdgcn.workitem.id.x() 879 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 880 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 881 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 882 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 883 ; Each product below takes the %vec2 element as its first operand (%v2eN * %v1eN). 884 %v1e0 = extractelement <4 x i8> %vec1, i64 0 885 %v2e0 = extractelement <4 x i8> %vec2, i64 0 886 %mul1 = mul nuw nsw i8 %v2e0, %v1e0 887 888 %v1e1 = extractelement <4 x i8> %vec1, i64 1 889 %v2e1 = extractelement <4 x i8> %vec2, i64 1 890 %mul2 = mul nuw nsw i8 %v2e1, %v1e1 891 892 %v1e2 = extractelement <4 x i8> %vec1, i64 2 893 %v2e2 = extractelement <4 x i8> %vec2, i64 2 894 %mul3 = mul nuw nsw i8 %v2e2, %v1e2 895 896 %v1e3 = extractelement <4 x i8> %vec1, i64 3 897 %v2e3 = extractelement <4 x i8> %vec2, i64 3 898 %mul4 = mul nuw nsw i8 %v2e3, %v1e3 899 ; %mad1 is %acc + %mul1, while %mad2, %mad3 and %mad4 each put the new product first and the previous sum second. 900 %acc = load i8, ptr addrspace(1) %dst, align 2 901 %mad1 = add i8 %acc, %mul1 902 %mad2 = add i8 %mul2, %mad1 903 %mad3 = add i8 %mul3, %mad2 904 %mad4 = add i8 %mul4, %mad3 905 906 store i8 %mad4, ptr addrspace(1) %dst, align 2 907 ret void 908} 909 910define amdgpu_kernel 
void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, 911; GFX7-LABEL: udot4_CommutationAccrossMADs: 912; GFX7: ; %bb.0: ; %entry 913; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 914; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 915; GFX7-NEXT: s_mov_b32 s3, 0xf000 916; GFX7-NEXT: s_mov_b32 s6, 0 917; GFX7-NEXT: s_mov_b32 s7, s3 918; GFX7-NEXT: s_waitcnt lgkmcnt(0) 919; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 920; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 921; GFX7-NEXT: v_mov_b32_e32 v1, 0 922; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 923; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 924; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 925; GFX7-NEXT: s_mov_b32 s2, -1 926; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 927; GFX7-NEXT: s_waitcnt vmcnt(2) 928; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 929; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 930; GFX7-NEXT: s_waitcnt vmcnt(1) 931; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 932; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 933; GFX7-NEXT: s_waitcnt vmcnt(0) 934; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 935; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 936; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 937; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 938; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 939; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 940; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 941; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 942; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 943; GFX7-NEXT: s_endpgm 944; 945; GFX8-LABEL: udot4_CommutationAccrossMADs: 946; GFX8: ; %bb.0: ; %entry 947; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 948; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 949; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 950; GFX8-NEXT: s_waitcnt lgkmcnt(0) 951; GFX8-NEXT: v_mov_b32_e32 v1, s1 952; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 953; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 954; GFX8-NEXT: flat_load_dword v3, v[0:1] 955; GFX8-NEXT: v_mov_b32_e32 v1, s3 956; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 957; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 958; GFX8-NEXT: flat_load_dword v2, v[0:1] 959; GFX8-NEXT: v_mov_b32_e32 v0, s4 960; GFX8-NEXT: v_mov_b32_e32 v1, s5 961; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 962; GFX8-NEXT: s_waitcnt vmcnt(2) 963; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 964; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 965; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 966; GFX8-NEXT: s_waitcnt vmcnt(1) 967; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 968; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 969; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 970; GFX8-NEXT: s_waitcnt vmcnt(0) 971; GFX8-NEXT: v_mad_u16 v4, v8, v7, v4 972; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 973; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 974; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 975; GFX8-NEXT: flat_store_byte v[0:1], v2 976; GFX8-NEXT: s_endpgm 977; 978; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: 979; GFX9-NODL: ; %bb.0: ; %entry 980; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 981; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 982; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 983; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 984; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 985; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 986; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 987; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] 988; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 989; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 990; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 991; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 992; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 993; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 994; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v6, v3 995; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 996; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 997; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v2, v1, v3 998; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 999; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 1000; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 1001; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] 1002; 
GFX9-NODL-NEXT: s_endpgm 1003; 1004; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: 1005; GFX9-DL: ; %bb.0: ; %entry 1006; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1007; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1008; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1009; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 1010; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 1012; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] 1013; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 1014; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1015; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 1016; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] 1017; GFX9-DL-NEXT: s_endpgm 1018; 1019; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: 1020; GFX10-DL: ; %bb.0: ; %entry 1021; GFX10-DL-NEXT: s_clause 0x1 1022; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1023; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1024; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1025; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1026; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1027; GFX10-DL-NEXT: s_clause 0x1 1028; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 1029; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 1030; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 1031; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1032; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 1033; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 1034; GFX10-DL-NEXT: s_endpgm 1035; 1036; GFX11-DL-LABEL: udot4_CommutationAccrossMADs: 1037; GFX11-DL: ; %bb.0: ; %entry 1038; GFX11-DL-NEXT: s_clause 0x1 1039; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1040; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1041; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1042; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1043; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1044; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX11-DL-NEXT: s_clause 0x1 1046; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 1047; GFX11-DL-NEXT: 
global_load_b32 v0, v0, s[2:3] 1048; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] 1049; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1050; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 1051; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] 1052; GFX11-DL-NEXT: s_endpgm 1053 ptr addrspace(1) %src2, 1054 ptr addrspace(1) nocapture %dst) { 1055entry: 1056 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1057 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1058 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1059 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1060 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1061 ; Each product below takes the %vec2 element as its first operand (%v2eN * %v1eN). 1062 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1063 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1064 %mul1 = mul nuw nsw i8 %v2e0, %v1e0 1065 1066 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1067 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1068 %mul2 = mul nuw nsw i8 %v2e1, %v1e1 1069 1070 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1071 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1072 %mul3 = mul nuw nsw i8 %v2e2, %v1e2 1073 1074 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1075 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1076 %mul4 = mul nuw nsw i8 %v2e3, %v1e3 1077 ; The chain starts with %acc + %mul2, %mul1 is added second, then %mul3 and %mul4 follow. 1078 %acc = load i8, ptr addrspace(1) %dst, align 2 1079 %mad1 = add i8 %acc, %mul2 1080 %mad2 = add i8 %mad1, %mul1 1081 %mad3 = add i8 %mad2, %mul3 1082 %mad4 = add i8 %mad3, %mul4 1083 1084 store i8 %mad4, ptr addrspace(1) %dst, align 2 1085 ret void 1086} 1087 1088define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, 1089; GFX7-LABEL: udot4_multiuse_mul1: 1090; GFX7: ; %bb.0: ; %entry 1091; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1092; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1093; GFX7-NEXT: s_mov_b32 s3, 0xf000 1094; GFX7-NEXT: s_mov_b32 s6, 0 1095; GFX7-NEXT: s_mov_b32 s7, s3 1096; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1097; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1098; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1099; GFX7-NEXT: v_mov_b32_e32 v1, 0 
1100; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1101; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1102; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1103; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1104; GFX7-NEXT: s_mov_b32 s2, -1 1105; GFX7-NEXT: s_waitcnt vmcnt(1) 1106; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 1107; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 1108; GFX7-NEXT: s_waitcnt vmcnt(0) 1109; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 1110; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 1111; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1112; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4 1113; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 1114; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 1115; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 1116; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 1117; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1118; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1119; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 1120; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1121; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1122; GFX7-NEXT: s_endpgm 1123; 1124; GFX8-LABEL: udot4_multiuse_mul1: 1125; GFX8: ; %bb.0: ; %entry 1126; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1127; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1128; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1129; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX8-NEXT: v_mov_b32_e32 v1, s1 1131; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1132; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1133; GFX8-NEXT: flat_load_dword v3, v[0:1] 1134; GFX8-NEXT: v_mov_b32_e32 v1, s3 1135; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1136; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1137; GFX8-NEXT: flat_load_dword v0, v[0:1] 1138; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1139; GFX8-NEXT: s_waitcnt vmcnt(1) 1140; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 1141; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 1142; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 1143; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1144; GFX8-NEXT: s_waitcnt vmcnt(0) 1145; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 1146; GFX8-NEXT: 
v_bfe_u32 v5, v0, 8, 8 1147; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1148; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0 1149; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8 1150; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 1151; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 1152; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1153; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 1154; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 1155; GFX8-NEXT: v_mov_b32_e32 v0, s4 1156; GFX8-NEXT: v_mov_b32_e32 v1, s5 1157; GFX8-NEXT: flat_store_dword v[0:1], v2 1158; GFX8-NEXT: s_endpgm 1159; 1160; GFX9-NODL-LABEL: udot4_multiuse_mul1: 1161; GFX9-NODL: ; %bb.0: ; %entry 1162; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1163; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1164; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1165; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1166; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1167; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1168; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1169; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1170; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1171; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 1172; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1173; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 1174; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1175; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1176; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1177; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 1178; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1179; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 1180; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 1181; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 1182; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1183; GFX9-NODL-NEXT: s_endpgm 1184; 1185; GFX9-DL-LABEL: udot4_multiuse_mul1: 1186; GFX9-DL: ; %bb.0: ; %entry 1187; GFX9-DL-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 1188; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1189; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1190; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1191; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1192; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1193; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1194; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1195; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1196; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 1197; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1198; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 1199; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 1201; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 1202; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1203; GFX9-DL-NEXT: s_endpgm 1204; 1205; GFX10-DL-LABEL: udot4_multiuse_mul1: 1206; GFX10-DL: ; %bb.0: ; %entry 1207; GFX10-DL-NEXT: s_clause 0x1 1208; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1209; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1210; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1211; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX10-DL-NEXT: s_clause 0x1 1213; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1214; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1215; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1216; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1217; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1218; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 1219; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1220; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 1221; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s0 1223; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 1224; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0 1225; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 1226; GFX10-DL-NEXT: s_endpgm 1227; 1228; GFX11-DL-LABEL: udot4_multiuse_mul1: 1229; GFX11-DL: ; %bb.0: ; %entry 1230; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1231; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1232; GFX11-DL-NEXT: s_load_b64 s[4:5], 
s[4:5], 0x34 1233; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1234; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1235; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX11-DL-NEXT: s_clause 0x1 1237; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 1238; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 1239; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 1240; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1241; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 1242; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1243; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0 1244; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1245; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1246; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s0 1247; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 1248; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 1249; GFX11-DL-NEXT: global_store_b32 v3, v0, s[4:5] 1250; GFX11-DL-NEXT: s_endpgm 1251 ptr addrspace(1) %src2, 1252 ptr addrspace(1) nocapture %dst) { 1253entry: 1254 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1255 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1256 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1257 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1258 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1259 1260 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1261 %cv1e0 = zext i8 %v1e0 to i32 1262 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1263 %cv2e0 = zext i8 %v2e0 to i32 1264 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1265 1266 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1267 %cv1e1 = zext i8 %v1e1 to i32 1268 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1269 %cv2e1 = zext i8 %v2e1 to i32 1270 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1271 1272 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1273 %cv1e2 = zext i8 %v1e2 to i32 1274 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1275 %cv2e2 = zext i8 %v2e2 to i32 1276 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1277 1278 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1279 %cv1e3 = zext i8 %v1e3 to i32 
; udot4_multiuse_mul1: in the add chain further down, %mul1 has two uses, once in %add (%mul1 + %acc) and again in %add2 (%add1 + %mul1).
1280 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1281 %cv2e3 = zext i8 %v2e3 to i32 1282 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1283 1284 %acc = load i32, ptr addrspace(1) %dst, align 4 1285 %add = add i32 %mul1, %acc 1286 %add1 = add i32 %mul2, %add 1287 %add2 = add i32 %add1, %mul1 1288 %add3 = add i32 %add2, %mul3 1289 %add4 = add i32 %add3, %mul4 1290 1291 store i32 %add4, ptr addrspace(1) %dst, align 4 1292 ret void 1293} 1294 1295define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, 1296; GFX7-LABEL: udot4_multiuse_add1: 1297; GFX7: ; %bb.0: ; %entry 1298; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1299; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1300; GFX7-NEXT: s_mov_b32 s3, 0xf000 1301; GFX7-NEXT: s_mov_b32 s6, 0 1302; GFX7-NEXT: s_mov_b32 s7, s3 1303; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1305; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1306; GFX7-NEXT: v_mov_b32_e32 v1, 0 1307; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1308; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1309; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1310; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1311; GFX7-NEXT: s_mov_b32 s2, -1 1312; GFX7-NEXT: s_waitcnt vmcnt(1) 1313; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 1314; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 1315; GFX7-NEXT: s_waitcnt vmcnt(0) 1316; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 1317; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 1318; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4 1320; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 1321; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 1322; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 1323; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1324; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1325; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 1326; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3 1327; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1328; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6 1329; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1330; 
GFX7-NEXT: s_endpgm 1331; 1332; GFX8-LABEL: udot4_multiuse_add1: 1333; GFX8: ; %bb.0: ; %entry 1334; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1335; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1336; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1337; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1338; GFX8-NEXT: v_mov_b32_e32 v1, s1 1339; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1340; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1341; GFX8-NEXT: flat_load_dword v3, v[0:1] 1342; GFX8-NEXT: v_mov_b32_e32 v1, s3 1343; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1344; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1345; GFX8-NEXT: flat_load_dword v0, v[0:1] 1346; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1347; GFX8-NEXT: s_waitcnt vmcnt(1) 1348; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 1349; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 1350; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 1351; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1352; GFX8-NEXT: s_waitcnt vmcnt(0) 1353; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 1354; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 1355; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1356; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0 1357; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 1358; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 1359; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1360; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 1361; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4 1362; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1 1363; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5 1364; GFX8-NEXT: v_mov_b32_e32 v0, s4 1365; GFX8-NEXT: v_mov_b32_e32 v1, s5 1366; GFX8-NEXT: flat_store_dword v[0:1], v2 1367; GFX8-NEXT: s_endpgm 1368; 1369; GFX9-NODL-LABEL: udot4_multiuse_add1: 1370; GFX9-NODL: ; %bb.0: ; %entry 1371; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1372; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1373; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1374; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1375; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1376; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1377; GFX9-NODL-NEXT: 
s_load_dword s0, s[6:7], 0x0 1378; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1379; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1380; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 1381; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1382; GFX9-NODL-NEXT: v_bfe_u32 v5, v2, 8, 8 1383; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1384; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1385; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1386; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1387; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 1388; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2 1389; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6 1390; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4 1391; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1392; GFX9-NODL-NEXT: s_endpgm 1393; 1394; GFX9-DL-LABEL: udot4_multiuse_add1: 1395; GFX9-DL: ; %bb.0: ; %entry 1396; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1397; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1398; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1399; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1400; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1401; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1402; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1403; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1404; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1405; GFX9-DL-NEXT: s_add_i32 s1, s0, s0 1406; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1407; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1408; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 1409; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1 1410; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1411; GFX9-DL-NEXT: s_endpgm 1412; 1413; GFX10-DL-LABEL: udot4_multiuse_add1: 1414; GFX10-DL: ; %bb.0: ; %entry 1415; GFX10-DL-NEXT: s_clause 0x1 1416; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
1417; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1418; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1419; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1420; GFX10-DL-NEXT: s_clause 0x1 1421; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1422; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1423; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1424; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1425; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1426; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1427; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1428; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 1429; GFX10-DL-NEXT: s_add_i32 s0, s0, s0 1430; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1431; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1 1432; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 1433; GFX10-DL-NEXT: s_endpgm 1434; 1435; GFX11-DL-LABEL: udot4_multiuse_add1: 1436; GFX11-DL: ; %bb.0: ; %entry 1437; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1438; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1439; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1440; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1441; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1442; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX11-DL-NEXT: s_clause 0x1 1444; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 1445; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 1446; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 1447; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1448; GFX11-DL-NEXT: v_bfe_u32 v2, v1, 8, 8 1449; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1450; GFX11-DL-NEXT: v_bfe_u32 v3, v0, 8, 8 1451; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1452; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 1453; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 1454; GFX11-DL-NEXT: s_add_i32 s0, s0, s0 1455; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 1456; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1457; GFX11-DL-NEXT: v_add3_u32 v0, s0, v2, v0 1458; GFX11-DL-NEXT: global_store_b32 v1, v0, s[4:5] 1459; GFX11-DL-NEXT: s_endpgm 
; udot4_multiuse_add1: in the body below, %add1 (%mul2 + %acc) has two uses, %add (%add1 + %acc) and %add2 (%add1 + %mul1), and %res adds %add4 and %add together.
1460 ptr addrspace(1) %src2, 1461 ptr addrspace(1) nocapture %dst) { 1462entry: 1463 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1464 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1465 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1466 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1467 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1468 1469 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1470 %cv1e0 = zext i8 %v1e0 to i32 1471 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1472 %cv2e0 = zext i8 %v2e0 to i32 1473 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1474 1475 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1476 %cv1e1 = zext i8 %v1e1 to i32 1477 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1478 %cv2e1 = zext i8 %v2e1 to i32 1479 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1480 1481 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1482 %cv1e2 = zext i8 %v1e2 to i32 1483 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1484 %cv2e2 = zext i8 %v2e2 to i32 1485 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1486 1487 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1488 %cv1e3 = zext i8 %v1e3 to i32 1489 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1490 %cv2e3 = zext i8 %v2e3 to i32 1491 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1492 1493 %acc = load i32, ptr addrspace(1) %dst, align 4 1494 %add1 = add i32 %mul2, %acc 1495 %add = add i32 %add1, %acc 1496 %add2 = add i32 %add1, %mul1 1497 %add3 = add i32 %add2, %mul3 1498 %add4 = add i32 %add3, %mul4 1499 %res = add i32 %add4, %add 1500 store i32 %res, ptr addrspace(1) %dst, align 4 1501 ret void 1502} 1503 1504define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, 1505; GFX7-LABEL: notdot4_mixedtypes: 1506; GFX7: ; %bb.0: ; %entry 1507; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1508; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1509; GFX7-NEXT: s_mov_b32 s3, 0xf000 1510; GFX7-NEXT: s_mov_b32 s6, 0 1511; GFX7-NEXT: s_mov_b32 s7, s3 1512; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1513; GFX7-NEXT: s_mov_b64 
s[4:5], s[8:9] 1514; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1515; GFX7-NEXT: v_mov_b32_e32 v1, 0 1516; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1517; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1518; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1519; GFX7-NEXT: s_mov_b32 s2, -1 1520; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 1521; GFX7-NEXT: s_waitcnt vmcnt(2) 1522; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 1523; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 1524; GFX7-NEXT: s_waitcnt vmcnt(1) 1525; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 1526; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 1527; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 1528; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 1529; GFX7-NEXT: s_waitcnt vmcnt(0) 1530; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 1531; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 1532; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 1533; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 1534; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1535; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1536; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 1537; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1538; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 1539; GFX7-NEXT: s_endpgm 1540; 1541; GFX8-LABEL: notdot4_mixedtypes: 1542; GFX8: ; %bb.0: ; %entry 1543; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1544; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1545; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1546; GFX8-NEXT: v_mov_b32_e32 v5, 0xff 1547; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1548; GFX8-NEXT: v_mov_b32_e32 v1, s1 1549; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1550; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1551; GFX8-NEXT: flat_load_dword v3, v[0:1] 1552; GFX8-NEXT: v_mov_b32_e32 v1, s3 1553; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1554; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1555; GFX8-NEXT: flat_load_dword v2, v[0:1] 1556; GFX8-NEXT: v_mov_b32_e32 v0, s4 1557; GFX8-NEXT: v_mov_b32_e32 v1, s5 1558; GFX8-NEXT: flat_load_ushort v4, v[0:1] 1559; GFX8-NEXT: s_waitcnt vmcnt(2) 1560; GFX8-NEXT: 
v_lshrrev_b32_e32 v8, 8, v3 1561; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 1562; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 1563; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1564; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 1565; GFX8-NEXT: s_waitcnt vmcnt(1) 1566; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 1567; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 1568; GFX8-NEXT: v_bfe_i32 v7, v2, 0, 8 1569; GFX8-NEXT: s_waitcnt vmcnt(0) 1570; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 1571; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1572; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 1573; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1574; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 1575; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1576; GFX8-NEXT: flat_store_short v[0:1], v2 1577; GFX8-NEXT: s_endpgm 1578; 1579; GFX9-NODL-LABEL: notdot4_mixedtypes: 1580; GFX9-NODL: ; %bb.0: ; %entry 1581; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1582; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1583; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1584; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1586; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1587; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1588; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] 1589; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff 1590; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 1591; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 1592; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1593; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 1594; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 1595; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 1596; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 1597; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 1598; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1599; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 1600; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1601; 
GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1602; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 1603; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 1604; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1605; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 1606; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1607; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] 1608; GFX9-NODL-NEXT: s_endpgm 1609; 1610; GFX9-DL-LABEL: notdot4_mixedtypes: 1611; GFX9-DL: ; %bb.0: ; %entry 1612; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1613; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1614; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1615; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1616; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1617; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1618; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1619; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] 1620; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0302 1621; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1622; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 1623; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1624; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 1625; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 1626; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7 1627; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 1628; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 1629; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1630; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 1631; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 1632; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 1633; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 1634; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 1635; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] 1636; GFX9-DL-NEXT: s_endpgm 1637; 1638; GFX10-DL-LABEL: notdot4_mixedtypes: 1639; GFX10-DL: ; %bb.0: ; %entry 1640; GFX10-DL-NEXT: s_clause 0x1 1641; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1642; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1643; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1644; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1645; GFX10-DL-NEXT: s_clause 0x1 1646; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1647; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1648; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1649; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] 1650; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 1651; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 1652; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1653; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 1654; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 1655; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 1656; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0302 1657; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 1658; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 1659; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 1660; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1661; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 1662; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 1663; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 1664; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 1665; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] 1666; GFX10-DL-NEXT: s_endpgm 1667; 1668; GFX11-DL-LABEL: notdot4_mixedtypes: 1669; GFX11-DL: ; %bb.0: ; %entry 1670; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1671; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1672; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1673; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1674; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1675; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1676; GFX11-DL-NEXT: s_clause 0x1 1677; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 1678; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 1679; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1680; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 1681; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1682; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 1683; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 1684; GFX11-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 1685; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8 1686; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 1687; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 1688; 
GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5] 1689; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0302 1690; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 1691; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1692; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 1693; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1694; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 1695; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 1696; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1697; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 1698; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] 1699; GFX11-DL-NEXT: s_endpgm 1700 ptr addrspace(1) %src2, 1701 ptr addrspace(1) nocapture %dst) { 1702entry: 1703 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1704 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1705 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1706 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1707 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1708 1709 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1710 %cv1e0 = sext i8 %v1e0 to i16 1711 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1712 %cv2e0 = sext i8 %v2e0 to i16 1713 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0 1714 1715 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1716 %cv1e1 = zext i8 %v1e1 to i16 1717 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1718 %cv2e1 = zext i8 %v2e1 to i16 1719 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1 1720 1721 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1722 %cv1e2 = zext i8 %v1e2 to i16 1723 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1724 %cv2e2 = zext i8 %v2e2 to i16 1725 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2 1726 1727 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1728 %cv1e3 = zext i8 %v1e3 to i16 1729 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1730 %cv2e3 = zext i8 %v2e3 to i16 1731 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3 1732 1733 %acc = load i16, ptr addrspace(1) %dst, align 2 1734 %add1 = add i16 %mul2, %acc 1735 %add2 = add i16 %add1, %mul1 1736 %add3 = 
add i16 %add2, %mul3
  %add4 = add i16 %add3, %mul4

  store i16 %add4, ptr addrspace(1) %dst, align 2
  ret void
}


; Negative test: every product mixes a sign-extended and a zero-extended i8
; lane (sext * zext for lanes 0 and 2, zext * sext for lanes 1 and 3), and the
; four products are accumulated into an i16 loaded from %dst. None of the
; check blocks for this function contains a v_dot4 instruction.
; The assertions between the first parameter and the rest of the signature are
; autogenerated; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX7-LABEL: notdot4_mixedtypes2:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: notdot4_mixedtypes2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 8
; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v4, v9, v10, v4
; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX8-NEXT: v_mad_u16 v4, v7, v8, v4
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: v_mad_u16 v4, v6, v5, v4
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: notdot4_mixedtypes2:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX9-NODL-NEXT: v_bfe_i32 v8, v8, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NODL-NEXT: v_bfe_i32 v5, v1, 0, 8
; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3
; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notdot4_mixedtypes2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX9-DL-NEXT: v_bfe_i32 v8, v8, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-DL-NEXT: v_bfe_i32 v5, v1, 0, 8
; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3
; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notdot4_mixedtypes2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX10-DL-NEXT: v_bfe_i32 v7, v1, 0, 8
; GFX10-DL-NEXT: v_and_b32_e32 v9, 0xff, v2
; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX10-DL-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v3, v7, v9, v3
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: notdot4_mixedtypes2:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v9, 0xff, v0
; GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5]
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-DL-NEXT: v_bfe_i32 v8, v1, 0, 8
; GFX11-DL-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX11-DL-NEXT: v_bfe_i32 v4, v6, 0, 8
; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v7
; GFX11-DL-NEXT: v_mad_u16 v3, v8, v9, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3
; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2

  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0 = sext i8 %v1e0 to i16
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0 = zext i8 %v2e0 to i16
  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0

  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = zext i8 %v1e1 to i16
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1 = sext i8 %v2e1 to i16
  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1

  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = sext i8 %v1e2 to i16
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = zext i8 %v2e2 to i16
  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2

  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = zext i8 %v1e3 to i16
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = sext i8 %v2e3 to i16
  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3

  %acc = load i16, ptr addrspace(1) %dst, align 2
  %add1 = add i16 %mul2, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul3
  %add4 = add i16 %add3, %mul4

  store i16 %add4, ptr addrspace(1) %dst, align 2
  ret void
}

; TODO: cleanup s_lshr_b32
; NOTE(review): no s_lshr_b32 appears in any of the checks for this function,
; so the TODO above may be stale -- confirm before removing it.
; Vector form of the unsigned 4 x i8 dot product: both <4 x i8> loads are
; zero-extended to <4 x i32>, multiplied as whole vectors, and the four lanes
; are then summed into an i32 accumulator loaded from %dst. The three -DL check
; blocks expect a single v_dot4_u32_u8; the other blocks expect a mul/mad chain.
define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc32_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_vecMul:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2

  %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
  %cvec2 = zext <4 x i8> %vec2 to <4 x i32>

  %mul = mul <4 x i32> %cvec1, %cvec2
  %mul0 = extractelement <4 x i32> %mul, i64 0
  %mul1 = extractelement <4 x i32> %mul, i64 1
  %mul2 = extractelement <4 x i32> %mul, i64 2
  %mul3 = extractelement <4 x i32> %mul, i64 3

  %acc = load i32, ptr addrspace(1) %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3

  store i32 %add4, ptr addrspace(1) %dst, align 4
  ret void
}

; TODO: This pattern should be recognized.
2175define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, 2176; GFX7-LABEL: udot4_acc16_vecMul: 2177; GFX7: ; %bb.0: ; %entry 2178; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2179; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2180; GFX7-NEXT: s_mov_b32 s3, 0xf000 2181; GFX7-NEXT: s_mov_b32 s6, 0 2182; GFX7-NEXT: s_mov_b32 s7, s3 2183; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2185; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2186; GFX7-NEXT: v_mov_b32_e32 v1, 0 2187; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2188; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2189; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2190; GFX7-NEXT: s_mov_b32 s2, -1 2191; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 2192; GFX7-NEXT: s_waitcnt vmcnt(2) 2193; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 2194; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2 2195; GFX7-NEXT: s_waitcnt vmcnt(1) 2196; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 2197; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 2198; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 2199; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 2200; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 2201; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16 2202; GFX7-NEXT: s_waitcnt vmcnt(0) 2203; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 2204; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2 2205; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 2206; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 2207; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 2208; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 2209; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2210; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0 2211; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 2212; GFX7-NEXT: s_endpgm 2213; 2214; GFX8-LABEL: udot4_acc16_vecMul: 2215; GFX8: ; %bb.0: ; %entry 2216; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2217; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2218; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2219; GFX8-NEXT: v_mov_b32_e32 v5, 0xff 2220; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2221; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 2222; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2223; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2224; GFX8-NEXT: flat_load_dword v3, v[0:1] 2225; GFX8-NEXT: v_mov_b32_e32 v1, s3 2226; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2227; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2228; GFX8-NEXT: flat_load_dword v2, v[0:1] 2229; GFX8-NEXT: v_mov_b32_e32 v0, s4 2230; GFX8-NEXT: v_mov_b32_e32 v1, s5 2231; GFX8-NEXT: flat_load_ushort v4, v[0:1] 2232; GFX8-NEXT: s_waitcnt vmcnt(2) 2233; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 2234; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3 2235; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2236; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3 2237; GFX8-NEXT: s_waitcnt vmcnt(1) 2238; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2 2239; GFX8-NEXT: v_lshrrev_b16_e32 v9, 8, v2 2240; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2241; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 2242; GFX8-NEXT: s_waitcnt vmcnt(0) 2243; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2244; GFX8-NEXT: v_mad_u16 v2, v7, v9, v2 2245; GFX8-NEXT: v_mad_u16 v2, v10, v5, v2 2246; GFX8-NEXT: v_mad_u16 v2, v6, v8, v2 2247; GFX8-NEXT: flat_store_short v[0:1], v2 2248; GFX8-NEXT: s_endpgm 2249; 2250; GFX9-NODL-LABEL: udot4_acc16_vecMul: 2251; GFX9-NODL: ; %bb.0: ; %entry 2252; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2253; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2254; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2255; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2256; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2257; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2258; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2259; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] 2260; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff 2261; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 2262; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 2263; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 
2264; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 2265; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2266; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 2267; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 2268; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 2269; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2270; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 2271; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2272; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 2273; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 2274; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 2275; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 2276; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2277; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 2278; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2279; GFX9-NODL-NEXT: v_add_u16_e32 v3, v2, v3 2280; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2281; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 2282; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2283; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] 2284; GFX9-NODL-NEXT: s_endpgm 2285; 2286; GFX9-DL-LABEL: udot4_acc16_vecMul: 2287; GFX9-DL: ; %bb.0: ; %entry 2288; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2289; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2290; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2291; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2292; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2293; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2294; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2295; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] 2296; GFX9-DL-NEXT: s_movk_i32 s0, 0xff 2297; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 2298; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2299; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 2300; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 2301; GFX9-DL-NEXT: s_waitcnt 
vmcnt(1) 2302; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 2303; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 2304; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 2305; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2306; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 2307; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2308; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1 2309; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 2310; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 2311; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 2312; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2313; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 2314; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2315; GFX9-DL-NEXT: v_add_u16_e32 v3, v2, v3 2316; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2317; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 2318; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2319; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] 2320; GFX9-DL-NEXT: s_endpgm 2321; 2322; GFX10-DL-LABEL: udot4_acc16_vecMul: 2323; GFX10-DL: ; %bb.0: ; %entry 2324; GFX10-DL-NEXT: s_clause 0x1 2325; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2326; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2327; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2328; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff 2329; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2330; GFX10-DL-NEXT: s_clause 0x1 2331; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2332; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2333; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2334; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] 2335; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 2336; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1 2337; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2338; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v2 2339; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 2340; GFX10-DL-NEXT: v_and_b32_e32 
v7, 0xff, v1 2341; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 2342; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 2343; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 2344; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 2345; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2346; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2347; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 2348; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 2349; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 2350; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2351; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2352; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 2353; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2354; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 2355; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2356; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 2357; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 2358; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] 2359; GFX10-DL-NEXT: s_endpgm 2360; 2361; GFX11-DL-LABEL: udot4_acc16_vecMul: 2362; GFX11-DL: ; %bb.0: ; %entry 2363; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2364; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2365; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2366; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2367; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2368; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2369; GFX11-DL-NEXT: s_clause 0x1 2370; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 2371; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 2372; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2373; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1 2374; GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5] 2375; GFX11-DL-NEXT: v_lshrrev_b16 v4, 8, v1 2376; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2377; GFX11-DL-NEXT: v_lshrrev_b16 v5, 8, v0 2378; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v0 2379; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2380; GFX11-DL-NEXT: 
v_lshrrev_b32_e32 v9, 16, v0 2381; GFX11-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 2382; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 2383; GFX11-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 2384; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 2385; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v9 2386; GFX11-DL-NEXT: v_and_b32_e32 v7, 0xff, v8 2387; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 2388; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 2389; GFX11-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 2390; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2391; GFX11-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 2392; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2393; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2394; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 2395; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2396; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 2397; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 2398; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2399; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2400; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 2401; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2402; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 2403; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] 2404; GFX11-DL-NEXT: s_endpgm 2405 ptr addrspace(1) %src2, 2406 ptr addrspace(1) nocapture %dst) { 2407entry: 2408 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2409 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2410 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2411 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2412 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2413 2414 %cvec1 = zext <4 x i8> %vec1 to <4 x i16> 2415 %cvec2 = zext <4 x i8> %vec2 to <4 x i16> 2416 2417 %mul = mul <4 x i16> %cvec1, %cvec2 2418 %mul0 = extractelement <4 x i16> %mul, i64 0 2419 %mul1 = extractelement <4 x i16> %mul, i64 1 2420 %mul2 = 
extractelement <4 x i16> %mul, i64 2 2421 %mul3 = extractelement <4 x i16> %mul, i64 3 2422 2423 %acc = load i16, ptr addrspace(1) %dst, align 4 2424 %add1 = add i16 %mul0, %acc 2425 %add2 = add i16 %add1, %mul1 2426 %add3 = add i16 %add2, %mul2 2427 %add4 = add i16 %add3, %mul3 2428 2429 store i16 %add4, ptr addrspace(1) %dst, align 4 2430 ret void 2431} 2432 2433; TODO: Support this pattern. The <4 x i8> operands are multiplied as a vector and the four i8 products are added to an i8 accumulator loaded from %dst; none of the checked targets, including the -DL ones, selects v_dot4_u32_u8 for it yet (presumably what supporting the pattern would mean). 2434define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, 2435; GFX7-LABEL: udot4_acc8_vecMul: 2436; GFX7: ; %bb.0: ; %entry 2437; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2438; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2439; GFX7-NEXT: s_mov_b32 s3, 0xf000 2440; GFX7-NEXT: s_mov_b32 s6, 0 2441; GFX7-NEXT: s_mov_b32 s7, s3 2442; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2443; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2444; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2445; GFX7-NEXT: v_mov_b32_e32 v1, 0 2446; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2447; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2448; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2449; GFX7-NEXT: s_mov_b32 s2, -1 2450; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 2451; GFX7-NEXT: s_waitcnt vmcnt(2) 2452; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 2453; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 2454; GFX7-NEXT: s_waitcnt vmcnt(1) 2455; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 2456; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 2457; GFX7-NEXT: s_waitcnt vmcnt(0) 2458; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 2459; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 2460; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 2461; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 2462; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 2463; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 2464; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2465; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 2466; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2467; GFX7-NEXT: s_endpgm 2468; 2469; GFX8-LABEL: udot4_acc8_vecMul: 2470; GFX8: ; %bb.0: ; %entry 2471; GFX8-NEXT: s_load_dwordx4 s[0:3],
s[4:5], 0x24 2472; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2473; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2474; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2475; GFX8-NEXT: v_mov_b32_e32 v1, s1 2476; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2477; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2478; GFX8-NEXT: flat_load_dword v3, v[0:1] 2479; GFX8-NEXT: v_mov_b32_e32 v1, s3 2480; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2481; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2482; GFX8-NEXT: flat_load_dword v2, v[0:1] 2483; GFX8-NEXT: v_mov_b32_e32 v0, s4 2484; GFX8-NEXT: v_mov_b32_e32 v1, s5 2485; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 2486; GFX8-NEXT: s_waitcnt vmcnt(2) 2487; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 2488; GFX8-NEXT: s_waitcnt vmcnt(1) 2489; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 2490; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 2491; GFX8-NEXT: v_mul_lo_u16_e32 v9, v5, v6 2492; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2493; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2494; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 2495; GFX8-NEXT: v_or_b32_e32 v8, v8, v9 2496; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8 2497; GFX8-NEXT: s_waitcnt vmcnt(0) 2498; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2499; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 2500; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v7 2501; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 2502; GFX8-NEXT: v_add_u16_e32 v2, v2, v7 2503; GFX8-NEXT: flat_store_byte v[0:1], v2 2504; GFX8-NEXT: s_endpgm 2505; 2506; GFX9-NODL-LABEL: udot4_acc8_vecMul: 2507; GFX9-NODL: ; %bb.0: ; %entry 2508; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2509; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2510; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2511; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2513; GFX9-NODL-NEXT: 
global_load_dword v2, v0, s[2:3] 2514; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2515; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] 2516; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 2517; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 2518; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2519; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 2520; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 2521; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 2522; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2523; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 2524; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2525; GFX9-NODL-NEXT: v_or_b32_e32 v6, v7, v6 2526; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 2527; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2528; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 2529; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 2530; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 2531; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v9 2532; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] 2533; GFX9-NODL-NEXT: s_endpgm 2534; 2535; GFX9-DL-LABEL: udot4_acc8_vecMul: 2536; GFX9-DL: ; %bb.0: ; %entry 2537; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2538; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2539; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2540; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2541; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2542; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2543; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2544; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 2545; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2546; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 2547; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2548; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 2549; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 2550; GFX9-DL-NEXT: 
v_mul_lo_u16_e32 v8, v4, v5 2551; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2552; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 2553; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2554; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 2555; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 2556; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2557; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 2558; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 2559; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 2560; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v9 2561; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] 2562; GFX9-DL-NEXT: s_endpgm 2563; 2564; GFX10-DL-LABEL: udot4_acc8_vecMul: 2565; GFX10-DL: ; %bb.0: ; %entry 2566; GFX10-DL-NEXT: s_clause 0x1 2567; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2568; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2569; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2570; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2571; GFX10-DL-NEXT: s_clause 0x1 2572; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2573; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2574; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2575; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 2576; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 2577; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 2578; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2579; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2 2580; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 2581; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 2582; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v2 2583; GFX10-DL-NEXT: v_lshrrev_b16 v9, 8, v2 2584; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5 2585; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2586; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 2587; GFX10-DL-NEXT: v_mul_lo_u16 v5, v7, v8 2588; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 2589; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4 2590; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 2591; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2592; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 2593; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2594; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 2595; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 2596; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1 2597; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 2598; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] 2599; GFX10-DL-NEXT: s_endpgm 2600; 2601; GFX11-DL-LABEL: udot4_acc8_vecMul: 2602; GFX11-DL: ; %bb.0: ; %entry 2603; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2604; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2605; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2606; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 2607; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 2608; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2609; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2610; GFX11-DL-NEXT: s_clause 0x1 2611; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 2612; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 2613; GFX11-DL-NEXT: global_load_u8 v3, v2, s[4:5] 2614; GFX11-DL-NEXT: s_waitcnt vmcnt(2) 2615; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 2616; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 2617; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2618; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v0 2619; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 2620; GFX11-DL-NEXT: v_lshrrev_b16 v8, 8, v1 2621; GFX11-DL-NEXT: v_lshrrev_b16 v9, 8, v0 2622; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2623; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 2624; GFX11-DL-NEXT: v_mul_lo_u16 v5, v5, v6 2625; GFX11-DL-NEXT: v_mul_lo_u16 v6, v4, v7 2626; GFX11-DL-NEXT: v_mul_lo_u16 v8, v8, v9 2627; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2628; GFX11-DL-NEXT: v_lshlrev_b16 v5, 8, v5 2629; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 2630; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2631; GFX11-DL-NEXT: 
v_lshlrev_b16 v8, 8, v8 2632; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v5 2633; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2634; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v5 2635; GFX11-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 2636; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2637; GFX11-DL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2638; GFX11-DL-NEXT: v_or_b32_e32 v6, v8, v6 2639; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2640; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 2641; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v6 2642; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2643; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0 2644; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 2645; GFX11-DL-NEXT: global_store_b8 v2, v0, s[4:5] 2646; GFX11-DL-NEXT: s_endpgm 2647 ptr addrspace(1) %src2, 2648 ptr addrspace(1) nocapture %dst) { 2649entry: 2650 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2651 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2652 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2653 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2654 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2655 2656 %mul = mul <4 x i8> %vec1, %vec2 2657 %mul0 = extractelement <4 x i8> %mul, i64 0 2658 %mul1 = extractelement <4 x i8> %mul, i64 1 2659 %mul2 = extractelement <4 x i8> %mul, i64 2 2660 %mul3 = extractelement <4 x i8> %mul, i64 3 2661 2662 %acc = load i8, ptr addrspace(1) %dst, align 4 2663 %add1 = add i8 %mul0, %acc 2664 %add2 = add i8 %add1, %mul1 2665 %add3 = add i8 %add2, %mul2 2666 %add4 = add i8 %add3, %mul3 2667 2668 store i8 %add4, ptr addrspace(1) %dst, align 4 2669 ret void 2670} 2671; Unsigned two-element dot product into an i32 accumulator loaded from %dst: only elements 0 and 1 of each <4 x i8> operand are used, and they are zero-extended (zext) despite the idot4 prefix. The -DL targets are checked to apply v_perm_b32 with selector 0xc0c0100 to both operands and then emit v_dot4_u32_u8. 2672define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, 2673; GFX7-LABEL: idot4_acc32_2ele: 2674; GFX7: ; %bb.0: ; %entry 2675; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2676; GFX7-NEXT:
s_load_dwordx2 s[4:5], s[4:5], 0xd 2677; GFX7-NEXT: s_mov_b32 s7, 0xf000 2678; GFX7-NEXT: s_mov_b32 s10, 0 2679; GFX7-NEXT: s_mov_b32 s11, s7 2680; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2681; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 2682; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2683; GFX7-NEXT: v_mov_b32_e32 v1, 0 2684; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2685; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 2686; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2687; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 2688; GFX7-NEXT: s_mov_b32 s6, -1 2689; GFX7-NEXT: s_waitcnt vmcnt(1) 2690; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 2691; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 2692; GFX7-NEXT: s_waitcnt vmcnt(0) 2693; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 2694; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 2695; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2696; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0 2697; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2698; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 2699; GFX7-NEXT: s_endpgm 2700; 2701; GFX8-LABEL: idot4_acc32_2ele: 2702; GFX8: ; %bb.0: ; %entry 2703; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2704; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2705; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2706; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2707; GFX8-NEXT: v_mov_b32_e32 v1, s1 2708; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2709; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2710; GFX8-NEXT: flat_load_dword v3, v[0:1] 2711; GFX8-NEXT: v_mov_b32_e32 v1, s3 2712; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2713; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2714; GFX8-NEXT: flat_load_dword v0, v[0:1] 2715; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2716; GFX8-NEXT: s_waitcnt vmcnt(1) 2717; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 2718; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 2719; GFX8-NEXT: s_waitcnt vmcnt(0) 2720; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 2721; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 2722; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2723; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 
2724; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 2725; GFX8-NEXT: v_mov_b32_e32 v0, s4 2726; GFX8-NEXT: v_mov_b32_e32 v1, s5 2727; GFX8-NEXT: flat_store_dword v[0:1], v2 2728; GFX8-NEXT: s_endpgm 2729; 2730; GFX9-NODL-LABEL: idot4_acc32_2ele: 2731; GFX9-NODL: ; %bb.0: ; %entry 2732; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2733; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2734; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2735; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2736; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2737; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2738; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 2739; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2740; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2741; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 2742; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2743; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2744; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 2745; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 2746; GFX9-NODL-NEXT: s_endpgm 2747; 2748; GFX9-DL-LABEL: idot4_acc32_2ele: 2749; GFX9-DL: ; %bb.0: ; %entry 2750; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2751; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2752; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2753; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2754; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 2755; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 2756; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2757; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 2758; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2759; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2760; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 2761; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2762; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 2763; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2764; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 2765; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 2766; GFX9-DL-NEXT: 
s_endpgm 2767; 2768; GFX10-DL-LABEL: idot4_acc32_2ele: 2769; GFX10-DL: ; %bb.0: ; %entry 2770; GFX10-DL-NEXT: s_clause 0x1 2771; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2772; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2773; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2774; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2775; GFX10-DL-NEXT: s_clause 0x1 2776; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 2777; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 2778; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 2779; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2780; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2781; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100 2782; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2783; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100 2784; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 2785; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2786; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 2787; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 2788; GFX10-DL-NEXT: s_endpgm 2789; 2790; GFX11-DL-LABEL: idot4_acc32_2ele: 2791; GFX11-DL: ; %bb.0: ; %entry 2792; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2793; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2794; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2795; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 2796; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 2797; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2798; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2799; GFX11-DL-NEXT: s_clause 0x1 2800; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 2801; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 2802; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 2803; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2804; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 2805; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2806; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100 2807; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2808; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2809; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 2810; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 2811; GFX11-DL-NEXT: s_endpgm 2812 
ptr addrspace(1) %src2, 2813 ptr addrspace(1) nocapture %dst) { 2814entry: 2815 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2816 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2817 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2818 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2819 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2820 2821 %v1e0 = extractelement <4 x i8> %vec1, i64 0 2822 %cv1e0 = zext i8 %v1e0 to i32 2823 %v2e0 = extractelement <4 x i8> %vec2, i64 0 2824 %cv2e0 = zext i8 %v2e0 to i32 2825 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 2826 2827 %v1e1 = extractelement <4 x i8> %vec1, i64 1 2828 %cv1e1 = zext i8 %v1e1 to i32 2829 %v2e1 = extractelement <4 x i8> %vec2, i64 1 2830 %cv2e1 = zext i8 %v2e1 to i32 2831 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 2832 2833 %acc = load i32, ptr addrspace(1) %dst, align 4 2834 %add1 = add i32 %mul1, %acc 2835 %add2 = add i32 %add1, %mul2 2836 store i32 %add2, ptr addrspace(1) %dst, align 4 2837 ret void 2838} 2839; Unsigned three-element dot product into an i32 accumulator loaded from %dst: elements 0, 1 and 2 of each <4 x i8> operand are zero-extended (zext) and multiplied. The -DL targets are checked to apply v_perm_b32 with selector 0xc020100 to both operands and then emit v_dot4_u32_u8. 2840define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, 2841; GFX7-LABEL: idot4_acc32_3ele: 2842; GFX7: ; %bb.0: ; %entry 2843; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2844; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2845; GFX7-NEXT: s_mov_b32 s3, 0xf000 2846; GFX7-NEXT: s_mov_b32 s6, 0 2847; GFX7-NEXT: s_mov_b32 s7, s3 2848; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2849; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2850; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2851; GFX7-NEXT: v_mov_b32_e32 v1, 0 2852; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2853; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2854; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2855; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 2856; GFX7-NEXT: s_mov_b32 s2, -1 2857; GFX7-NEXT: s_waitcnt vmcnt(1) 2858; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 2859; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 2860; GFX7-NEXT: s_waitcnt vmcnt(0) 2861; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 2862; GFX7-NEXT: v_bfe_u32
v5, v0, 8, 8 2863; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2864; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4 2865; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 2866; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 2867; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1 2868; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2869; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 2870; GFX7-NEXT: s_endpgm 2871; 2872; GFX8-LABEL: idot4_acc32_3ele: 2873; GFX8: ; %bb.0: ; %entry 2874; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2875; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2876; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2877; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2878; GFX8-NEXT: v_mov_b32_e32 v1, s1 2879; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2880; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2881; GFX8-NEXT: flat_load_dword v3, v[0:1] 2882; GFX8-NEXT: v_mov_b32_e32 v1, s3 2883; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2884; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2885; GFX8-NEXT: flat_load_dword v0, v[0:1] 2886; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2887; GFX8-NEXT: s_waitcnt vmcnt(1) 2888; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 2889; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 2890; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 2891; GFX8-NEXT: s_waitcnt vmcnt(0) 2892; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 2893; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 2894; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2895; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 2896; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 2897; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 2898; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 2899; GFX8-NEXT: v_mov_b32_e32 v0, s4 2900; GFX8-NEXT: v_mov_b32_e32 v1, s5 2901; GFX8-NEXT: flat_store_dword v[0:1], v2 2902; GFX8-NEXT: s_endpgm 2903; 2904; GFX9-NODL-LABEL: idot4_acc32_3ele: 2905; GFX9-NODL: ; %bb.0: ; %entry 2906; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2907; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2908; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2909; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2910; GFX9-NODL-NEXT: global_load_dword 
v1, v0, s[0:1] 2911; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2912; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 2913; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2914; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2915; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 2916; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2917; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 2918; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2919; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 2920; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2921; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 2922; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 2923; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 2924; GFX9-NODL-NEXT: s_endpgm 2925; 2926; GFX9-DL-LABEL: idot4_acc32_3ele: 2927; GFX9-DL: ; %bb.0: ; %entry 2928; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2929; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2930; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2931; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2932; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 2933; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 2934; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2935; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 2936; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2937; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2938; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 2939; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2940; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 2941; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2942; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 2943; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 2944; GFX9-DL-NEXT: s_endpgm 2945; 2946; GFX10-DL-LABEL: idot4_acc32_3ele: 2947; GFX10-DL: ; %bb.0: ; %entry 2948; GFX10-DL-NEXT: s_clause 0x1 2949; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2950; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2951; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2952; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2953; 
GFX10-DL-NEXT: s_clause 0x1 2954; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 2955; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 2956; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 2957; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2958; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2959; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 2960; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2961; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 2962; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 2963; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2964; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 2965; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 2966; GFX10-DL-NEXT: s_endpgm 2967; 2968; GFX11-DL-LABEL: idot4_acc32_3ele: 2969; GFX11-DL: ; %bb.0: ; %entry 2970; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2971; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2972; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2973; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 2974; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 2975; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2976; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2977; GFX11-DL-NEXT: s_clause 0x1 2978; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 2979; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 2980; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 2981; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2982; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 2983; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2984; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 2985; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2986; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2987; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 2988; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 2989; GFX11-DL-NEXT: s_endpgm 2990 ptr addrspace(1) %src2, 2991 ptr addrspace(1) nocapture %dst) { 2992entry: 2993 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2994 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2995 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2996 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2997 %vec2 = load 
<4 x i8>, ptr addrspace(1) %gep2 2998 2999 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3000 %cv1e0 = zext i8 %v1e0 to i32 3001 %v2e0 = extractelement <4 x i8> %vec2, i64 0 3002 %cv2e0 = zext i8 %v2e0 to i32 3003 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 3004 3005 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3006 %cv1e1 = zext i8 %v1e1 to i32 3007 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3008 %cv2e1 = zext i8 %v2e1 to i32 3009 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 3010 3011 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3012 %cv1e2 = zext i8 %v1e2 to i32 3013 %v2e2 = extractelement <4 x i8> %vec2, i64 2 3014 %cv2e2 = zext i8 %v2e2 to i32 3015 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 3016 3017 %acc = load i32, ptr addrspace(1) %dst, align 4 3018 %add1 = add i32 %mul1, %acc 3019 %add2 = add i32 %add1, %mul2 3020 %add3 = add i32 %add2, %mul3 3021 store i32 %add3, ptr addrspace(1) %dst, align 4 3022 ret void 3023} 3024 3025define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, 3026; GFX7-LABEL: idot4_acc32_3ele_permuted: 3027; GFX7: ; %bb.0: ; %entry 3028; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 3029; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 3030; GFX7-NEXT: s_mov_b32 s3, 0xf000 3031; GFX7-NEXT: s_mov_b32 s6, 0 3032; GFX7-NEXT: s_mov_b32 s7, s3 3033; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3034; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3035; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3036; GFX7-NEXT: v_mov_b32_e32 v1, 0 3037; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3038; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 3039; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 3040; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 3041; GFX7-NEXT: s_mov_b32 s2, -1 3042; GFX7-NEXT: s_waitcnt vmcnt(1) 3043; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 3044; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 3045; GFX7-NEXT: s_waitcnt vmcnt(0) 3046; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 3047; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 3048; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
3049; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4 3050; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 3051; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 3052; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1 3053; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 3054; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 3055; GFX7-NEXT: s_endpgm 3056; 3057; GFX8-LABEL: idot4_acc32_3ele_permuted: 3058; GFX8: ; %bb.0: ; %entry 3059; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3060; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 3061; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3062; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3063; GFX8-NEXT: v_mov_b32_e32 v1, s1 3064; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3065; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3066; GFX8-NEXT: flat_load_dword v3, v[0:1] 3067; GFX8-NEXT: v_mov_b32_e32 v1, s3 3068; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3069; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3070; GFX8-NEXT: flat_load_dword v0, v[0:1] 3071; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 3072; GFX8-NEXT: s_waitcnt vmcnt(1) 3073; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 3074; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 3075; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 3076; GFX8-NEXT: s_waitcnt vmcnt(0) 3077; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 3078; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0 3079; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3080; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 3081; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 3082; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 3083; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 3084; GFX8-NEXT: v_mov_b32_e32 v0, s4 3085; GFX8-NEXT: v_mov_b32_e32 v1, s5 3086; GFX8-NEXT: flat_store_dword v[0:1], v2 3087; GFX8-NEXT: s_endpgm 3088; 3089; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: 3090; GFX9-NODL: ; %bb.0: ; %entry 3091; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3092; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3093; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3094; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3095; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 3096; 
GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 3097; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 3098; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3099; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3100; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 3101; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3102; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 3103; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 3104; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 3105; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3106; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 3107; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 3108; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 3109; GFX9-NODL-NEXT: s_endpgm 3110; 3111; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: 3112; GFX9-DL: ; %bb.0: ; %entry 3113; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3114; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3115; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3116; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3117; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 3118; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 3119; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 3120; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 3121; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3122; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 3123; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 3124; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3125; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 3126; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3127; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 3128; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 3129; GFX9-DL-NEXT: s_endpgm 3130; 3131; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: 3132; GFX10-DL: ; %bb.0: ; %entry 3133; GFX10-DL-NEXT: s_clause 0x1 3134; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3135; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3136; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3137; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3138; 
GFX10-DL-NEXT: s_clause 0x1 3139; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 3140; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 3141; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 3142; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 3143; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 3144; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003 3145; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3146; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003 3147; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 3148; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3149; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 3150; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 3151; GFX10-DL-NEXT: s_endpgm 3152; 3153; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: 3154; GFX11-DL: ; %bb.0: ; %entry 3155; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3156; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3157; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3158; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 3159; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 3160; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3161; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3162; GFX11-DL-NEXT: s_clause 0x1 3163; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 3164; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 3165; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 3166; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 3167; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003 3168; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3169; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003 3170; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3171; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3172; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 3173; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 3174; GFX11-DL-NEXT: s_endpgm 3175 ptr addrspace(1) %src2, 3176 ptr addrspace(1) nocapture %dst) { 3177entry: 3178 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3179 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3180 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3181 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3182 
%vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3183 3184 %v1e0 = extractelement <4 x i8> %vec1, i64 3 3185 %cv1e0 = zext i8 %v1e0 to i32 3186 %v2e0 = extractelement <4 x i8> %vec2, i64 3 3187 %cv2e0 = zext i8 %v2e0 to i32 3188 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 3189 3190 %v1e1 = extractelement <4 x i8> %vec1, i64 0 3191 %cv1e1 = zext i8 %v1e1 to i32 3192 %v2e1 = extractelement <4 x i8> %vec2, i64 0 3193 %cv2e1 = zext i8 %v2e1 to i32 3194 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 3195 3196 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3197 %cv1e2 = zext i8 %v1e2 to i32 3198 %v2e2 = extractelement <4 x i8> %vec2, i64 2 3199 %cv2e2 = zext i8 %v2e2 to i32 3200 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 3201 3202 %acc = load i32, ptr addrspace(1) %dst, align 4 3203 %add1 = add i32 %mul1, %acc 3204 %add2 = add i32 %add1, %mul2 3205 %add3 = add i32 %add2, %mul3 3206 store i32 %add3, ptr addrspace(1) %dst, align 4 3207 ret void 3208} 3209 3210 
; idot4_acc32_opt - four-lane byte dot product with no accumulator input.
; The IR body zero-extends elements 0 through 3 of both loaded <4 x i8>
; vectors, multiplies them pairwise, sums the four products and stores the
; result to %dst; %dst is only written, never loaded.
; The GFX9-DL, GFX10-DL and GFX11-DL checks expect a single v_dot4_u32_u8 whose
; accumulator operand is the constant 0 (the unsigned form, despite the "idot4"
; name). The GFX7, GFX8 and GFX9-NODL checks expect v_mul_u32_u24 and
; v_mad_u32_u24 sequences (plus v_add3_u32 on GFX9-NODL).
; The check lines below are autogenerated (see the note at the top of the file).
3211define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, 3212; GFX7-LABEL: idot4_acc32_opt: 3213; GFX7: ; %bb.0: ; %entry 3214; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 3215; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 3216; GFX7-NEXT: s_mov_b32 s3, 0xf000 3217; GFX7-NEXT: s_mov_b32 s6, 0 3218; GFX7-NEXT: s_mov_b32 s7, s3 3219; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3220; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3221; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3222; GFX7-NEXT: v_mov_b32_e32 v1, 0 3223; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3224; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 3225; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 3226; GFX7-NEXT: s_mov_b32 s2, -1 3227; GFX7-NEXT: s_waitcnt vmcnt(1) 3228; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 3229; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 3230; GFX7-NEXT: s_waitcnt vmcnt(0) 3231; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 3232; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 3233; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6 3234; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 3235; 
GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 3236; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 3237; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3238; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 3239; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 3240; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 3241; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 3242; GFX7-NEXT: s_endpgm 3243; 3244; GFX8-LABEL: idot4_acc32_opt: 3245; GFX8: ; %bb.0: ; %entry 3246; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3247; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 3248; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3249; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3250; GFX8-NEXT: v_mov_b32_e32 v1, s1 3251; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3252; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3253; GFX8-NEXT: flat_load_dword v3, v[0:1] 3254; GFX8-NEXT: v_mov_b32_e32 v1, s3 3255; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3256; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3257; GFX8-NEXT: flat_load_dword v2, v[0:1] 3258; GFX8-NEXT: v_mov_b32_e32 v0, s4 3259; GFX8-NEXT: v_mov_b32_e32 v1, s5 3260; GFX8-NEXT: s_waitcnt vmcnt(1) 3261; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 3262; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8 3263; GFX8-NEXT: s_waitcnt vmcnt(0) 3264; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 3265; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 3266; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 3267; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 3268; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 3269; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3270; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 3271; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v4 3272; GFX8-NEXT: flat_store_dword v[0:1], v2 3273; GFX8-NEXT: s_endpgm 3274; 3275; GFX9-NODL-LABEL: idot4_acc32_opt: 3276; GFX9-NODL: ; %bb.0: ; %entry 3277; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3278; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3279; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3280; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3281; 
GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 3282; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 3283; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3284; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3285; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 3286; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3287; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 3288; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 3289; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 3290; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 3291; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 3292; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 3293; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 3294; GFX9-NODL-NEXT: s_endpgm 3295; 3296; GFX9-DL-LABEL: idot4_acc32_opt: 3297; GFX9-DL: ; %bb.0: ; %entry 3298; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3299; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3300; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3301; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3302; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 3303; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 3304; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3305; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3306; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 3307; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 3308; GFX9-DL-NEXT: s_endpgm 3309; 3310; GFX10-DL-LABEL: idot4_acc32_opt: 3311; GFX10-DL: ; %bb.0: ; %entry 3312; GFX10-DL-NEXT: s_clause 0x1 3313; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3314; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3315; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3316; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3317; GFX10-DL-NEXT: s_clause 0x1 3318; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 3319; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 3320; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 3321; GFX10-DL-NEXT: s_waitcnt 
vmcnt(0) 3322; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 3323; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 3324; GFX10-DL-NEXT: s_endpgm 3325; 3326; GFX11-DL-LABEL: idot4_acc32_opt: 3327; GFX11-DL: ; %bb.0: ; %entry 3328; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3329; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3330; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3331; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 3332; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 3333; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3334; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3335; GFX11-DL-NEXT: s_clause 0x1 3336; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 3337; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 3338; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3339; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 3340; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 3341; GFX11-DL-NEXT: s_endpgm 3342 ptr addrspace(1) %src2, 3343 ptr addrspace(1) nocapture %dst) { 3344entry: 3345 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3346 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3347 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3348 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3349 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3350 3351 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3352 %cv1e0 = zext i8 %v1e0 to i32 3353 %v2e0 = extractelement <4 x i8> %vec2, i64 0 3354 %cv2e0 = zext i8 %v2e0 to i32 3355 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 3356 3357 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3358 %cv1e1 = zext i8 %v1e1 to i32 3359 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3360 %cv2e1 = zext i8 %v2e1 to i32 3361 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 3362 3363 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3364 %cv1e2 = zext i8 %v1e2 to i32 3365 %v2e2 = extractelement <4 x i8> %vec2, i64 2 3366 %cv2e2 = zext i8 %v2e2 to i32 3367 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 3368 3369 %v1e3 = extractelement <4 x i8> %vec1, i64 3 3370 %cv1e3 = zext i8 %v1e3 to i32 3371 
%v2e3 = extractelement <4 x i8> %vec2, i64 3 3372 %cv2e3 = zext i8 %v2e3 to i32 3373 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 3374 3375 %add2 = add i32 %mul1, %mul2 3376 %add3 = add i32 %add2, %mul3 3377 %add4 = add i32 %add3, %mul4 3378 store i32 %add4, ptr addrspace(1) %dst, align 4 3379 ret void 3380} 3381 
; udot4_acc32_3src - four-lane unsigned byte dot product whose operands are
; drawn from three source vectors. Per the IR body: lane 0 is
; %vec1[0] * %vec1[0] (the same element squared), lane 1 is %vec1[1] * %vec2[1],
; and lanes 2 and 3 are %vec1[2] * %vec3[2] and %vec1[3] * %vec3[3]. The four
; products are added to the i32 loaded from %dst and the sum is stored back.
; The GFX9-DL, GFX10-DL and GFX11-DL checks expect the mixed operand to be
; assembled with two v_perm_b32 (selectors 0x706010c and 0xc0c0c00) joined by
; v_or_b32, followed by one v_dot4_u32_u8. The GFX7, GFX8 and GFX9-NODL checks
; expect per-lane 24-bit multiply / multiply-add sequences.
; The check lines below are autogenerated (see the note at the top of the file).
3382define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, 3383; GFX7-LABEL: udot4_acc32_3src: 3384; GFX7: ; %bb.0: ; %entry 3385; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 3386; GFX7-NEXT: s_mov_b32 s11, 0xf000 3387; GFX7-NEXT: s_mov_b32 s14, 0 3388; GFX7-NEXT: s_mov_b32 s15, s11 3389; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3390; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3391; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1] 3392; GFX7-NEXT: v_mov_b32_e32 v1, 0 3393; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 3394; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3] 3395; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 3396; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] 3397; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 3398; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0 3399; GFX7-NEXT: s_mov_b32 s10, -1 3400; GFX7-NEXT: s_mov_b32 s8, s6 3401; GFX7-NEXT: s_mov_b32 s9, s7 3402; GFX7-NEXT: s_waitcnt vmcnt(2) 3403; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 3404; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 3405; GFX7-NEXT: s_waitcnt vmcnt(1) 3406; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 3407; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3408; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0 3409; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 3410; GFX7-NEXT: s_waitcnt vmcnt(0) 3411; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8 3412; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 3413; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3414; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 3415; GFX7-NEXT: v_mad_u32_u24 v1, v5, v6, v1 3416; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 3417; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 3418; GFX7-NEXT: s_endpgm 3419; 3420; GFX8-LABEL: udot4_acc32_3src: 3421; GFX8: ; %bb.0: ; %entry 3422; 
GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 3423; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3424; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3425; GFX8-NEXT: v_mov_b32_e32 v1, s1 3426; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3427; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3428; GFX8-NEXT: flat_load_dword v3, v[0:1] 3429; GFX8-NEXT: v_mov_b32_e32 v1, s3 3430; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3431; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3432; GFX8-NEXT: flat_load_dword v4, v[0:1] 3433; GFX8-NEXT: v_mov_b32_e32 v1, s5 3434; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 3435; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3436; GFX8-NEXT: flat_load_dword v0, v[0:1] 3437; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 3438; GFX8-NEXT: s_waitcnt vmcnt(2) 3439; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 3440; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 3441; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3442; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0 3443; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 3444; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 3445; GFX8-NEXT: s_waitcnt vmcnt(1) 3446; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8 3447; GFX8-NEXT: v_mad_u32_u24 v1, v2, v4, v1 3448; GFX8-NEXT: s_waitcnt vmcnt(0) 3449; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 3450; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 3451; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1 3452; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 3453; GFX8-NEXT: v_mov_b32_e32 v0, s6 3454; GFX8-NEXT: v_mov_b32_e32 v1, s7 3455; GFX8-NEXT: flat_store_dword v[0:1], v2 3456; GFX8-NEXT: s_endpgm 3457; 3458; GFX9-NODL-LABEL: udot4_acc32_3src: 3459; GFX9-NODL: ; %bb.0: ; %entry 3460; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3461; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3462; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3463; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] 3464; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] 3465; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] 3466; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 3467; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3468; 
GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 3469; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 3470; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3471; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 3472; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3473; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 3474; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 3475; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3476; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2 3477; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 3478; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] 3479; GFX9-NODL-NEXT: s_endpgm 3480; 3481; GFX9-DL-LABEL: udot4_acc32_3src: 3482; GFX9-DL: ; %bb.0: ; %entry 3483; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3484; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3485; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c 3486; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00 3487; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3488; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] 3489; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] 3490; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] 3491; GFX9-DL-NEXT: s_load_dword s1, s[14:15], 0x0 3492; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3493; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 3494; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 3495; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3496; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 3497; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 3498; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3499; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v3, v1, s1 3500; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] 3501; GFX9-DL-NEXT: s_endpgm 3502; 3503; GFX10-DL-LABEL: udot4_acc32_3src: 3504; GFX10-DL: ; %bb.0: ; %entry 3505; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3506; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3507; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) 3508; GFX10-DL-NEXT: s_clause 0x2 3509; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] 3510; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] 3511; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] 3512; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 3513; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 3514; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c 3515; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3516; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c00 3517; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 3518; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 3519; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3520; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s0 3521; GFX10-DL-NEXT: global_store_dword v1, v0, s[14:15] 3522; GFX10-DL-NEXT: s_endpgm 3523; 3524; GFX11-DL-LABEL: udot4_acc32_3src: 3525; GFX11-DL: ; %bb.0: ; %entry 3526; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 3527; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3528; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3529; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3530; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3531; GFX11-DL-NEXT: s_clause 0x2 3532; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 3533; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] 3534; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 3535; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 3536; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 3537; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c 3538; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3539; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c00 3540; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 3541; GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v2 3542; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 3543; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3544; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 3545; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] 3546; GFX11-DL-NEXT: s_endpgm 3547 ptr addrspace(1) %src2, 3548 ptr addrspace(1) %src3, 3549 ptr addrspace(1) nocapture %dst) { 3550entry: 3551 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3552 %gep1 = 
getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3553 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3554 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3555 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3556 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 3557 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 3558 3559 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3560 %cv1e0 = zext i8 %v1e0 to i32 3561 %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 3562 3563 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3564 %cv1e1 = zext i8 %v1e1 to i32 3565 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3566 %cv2e1 = zext i8 %v2e1 to i32 3567 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 3568 3569 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3570 %cv1e2 = zext i8 %v1e2 to i32 3571 %v3e2 = extractelement <4 x i8> %vec3, i64 2 3572 %cv3e2 = zext i8 %v3e2 to i32 3573 %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 3574 3575 %v1e3 = extractelement <4 x i8> %vec1, i64 3 3576 %cv1e3 = zext i8 %v1e3 to i32 3577 %v3e3 = extractelement <4 x i8> %vec3, i64 3 3578 %cv3e3 = zext i8 %v3e3 to i32 3579 %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3 3580 3581 %acc = load i32, ptr addrspace(1) %dst, align 4 3582 %mad1 = add i32 %mul1, %acc 3583 %mad2 = add i32 %mad1, %mul2 3584 %mad3 = add i32 %mad2, %mul3 3585 %mad4 = add i32 %mad3, %mul4 3586 3587 store i32 %mad4, ptr addrspace(1) %dst, align 4 3588 ret void 3589} 3590 
; udot4_acc32_3src_3ele - three-lane unsigned byte dot product whose operands
; are drawn from three source vectors. Per the IR body: the products are
; %vec1[0] * %vec1[0], %vec1[1] * %vec2[1] and %vec1[2] * %vec3[2]; element 3
; of every vector is unused. The three products are added to the i32 loaded
; from %dst and the sum is stored back.
; The GFX9-DL, GFX10-DL and GFX11-DL checks expect one operand built from
; v_perm_b32 selectors 0xc06010c and 0xc0c0c00 joined by v_or_b32, the other
; operand passed through v_perm_b32 with selector 0xc020100, and then a single
; v_dot4_u32_u8. The GFX7, GFX8 and GFX9-NODL checks expect per-lane 24-bit
; multiply / multiply-add sequences.
; The check lines below are autogenerated (see the note at the top of the file).
3591define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, 3592; GFX7-LABEL: udot4_acc32_3src_3ele: 3593; GFX7: ; %bb.0: ; %entry 3594; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 3595; GFX7-NEXT: s_mov_b32 s11, 0xf000 3596; GFX7-NEXT: s_mov_b32 s14, 0 3597; GFX7-NEXT: s_mov_b32 s15, s11 3598; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3599; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3600; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1] 3601; GFX7-NEXT: v_mov_b32_e32 v1, 0 3602; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 3603; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3] 
3604; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 3605; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] 3606; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 3607; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0 3608; GFX7-NEXT: s_mov_b32 s10, -1 3609; GFX7-NEXT: s_mov_b32 s8, s6 3610; GFX7-NEXT: s_mov_b32 s9, s7 3611; GFX7-NEXT: s_waitcnt vmcnt(2) 3612; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 3613; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 3614; GFX7-NEXT: s_waitcnt vmcnt(1) 3615; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 3616; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3617; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0 3618; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 3619; GFX7-NEXT: s_waitcnt vmcnt(0) 3620; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 3621; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 3622; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 3623; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 3624; GFX7-NEXT: s_endpgm 3625; 3626; GFX8-LABEL: udot4_acc32_3src_3ele: 3627; GFX8: ; %bb.0: ; %entry 3628; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 3629; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3630; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3631; GFX8-NEXT: v_mov_b32_e32 v1, s1 3632; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3633; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3634; GFX8-NEXT: flat_load_dword v3, v[0:1] 3635; GFX8-NEXT: v_mov_b32_e32 v1, s3 3636; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3637; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3638; GFX8-NEXT: flat_load_dword v4, v[0:1] 3639; GFX8-NEXT: v_mov_b32_e32 v1, s5 3640; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 3641; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3642; GFX8-NEXT: flat_load_dword v0, v[0:1] 3643; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 3644; GFX8-NEXT: s_waitcnt vmcnt(2) 3645; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 3646; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 3647; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3648; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0 3649; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 3650; GFX8-NEXT: s_waitcnt vmcnt(1) 3651; GFX8-NEXT: 
v_bfe_u32 v4, v4, 8, 8 3652; GFX8-NEXT: v_mad_u32_u24 v1, v2, v4, v1 3653; GFX8-NEXT: s_waitcnt vmcnt(0) 3654; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 3655; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 3656; GFX8-NEXT: v_mov_b32_e32 v0, s6 3657; GFX8-NEXT: v_mov_b32_e32 v1, s7 3658; GFX8-NEXT: flat_store_dword v[0:1], v2 3659; GFX8-NEXT: s_endpgm 3660; 3661; GFX9-NODL-LABEL: udot4_acc32_3src_3ele: 3662; GFX9-NODL: ; %bb.0: ; %entry 3663; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3664; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3665; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3666; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] 3667; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] 3668; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] 3669; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 3670; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3671; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 3672; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 3673; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3674; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 3675; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3676; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 3677; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3678; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0 3679; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 3680; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] 3681; GFX9-NODL-NEXT: s_endpgm 3682; 3683; GFX9-DL-LABEL: udot4_acc32_3src_3ele: 3684; GFX9-DL: ; %bb.0: ; %entry 3685; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3686; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3687; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c 3688; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00 3689; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100 3690; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3691; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] 3692; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] 3693; GFX9-DL-NEXT: global_load_dword v3, 
v0, s[8:9] 3694; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 3695; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3696; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 3697; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 3698; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3699; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 3700; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 3701; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 3702; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3703; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3 3704; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] 3705; GFX9-DL-NEXT: s_endpgm 3706; 3707; GFX10-DL-LABEL: udot4_acc32_3src_3ele: 3708; GFX10-DL: ; %bb.0: ; %entry 3709; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3710; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3711; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3712; GFX10-DL-NEXT: s_clause 0x2 3713; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] 3714; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] 3715; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] 3716; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 3717; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 3718; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c 3719; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3720; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c00 3721; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 3722; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 3723; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020100 3724; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3725; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 3726; GFX10-DL-NEXT: global_store_dword v2, v0, s[14:15] 3727; GFX10-DL-NEXT: s_endpgm 3728; 3729; GFX11-DL-LABEL: udot4_acc32_3src_3ele: 3730; GFX11-DL: ; %bb.0: ; %entry 3731; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 3732; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3733; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3734; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3735; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3736; GFX11-DL-NEXT: s_clause 0x2 3737; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 3738; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] 
3739; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 3740; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 3741; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 3742; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c 3743; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3744; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c00 3745; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 3746; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 3747; GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v2 3748; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 3749; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3750; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 3751; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] 3752; GFX11-DL-NEXT: s_endpgm 3753 ptr addrspace(1) %src2, 3754 ptr addrspace(1) %src3, 3755 ptr addrspace(1) nocapture %dst) { 3756entry: 3757 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3758 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3759 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3760 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3761 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3762 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 3763 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 3764 3765 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3766 %cv1e0 = zext i8 %v1e0 to i32 3767 %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 3768 3769 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3770 %cv1e1 = zext i8 %v1e1 to i32 3771 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3772 %cv2e1 = zext i8 %v2e1 to i32 3773 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 3774 3775 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3776 %cv1e2 = zext i8 %v1e2 to i32 3777 %v3e2 = extractelement <4 x i8> %vec3, i64 2 3778 %cv3e2 = zext i8 %v3e2 to i32 3779 %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 3780 3781 3782 %acc = load i32, ptr addrspace(1) %dst, align 4 3783 %mad1 = add i32 %mul1, %acc 3784 %mad2 = add i32 %mad1, %mul2 3785 %mad3 = add i32 %mad2, %mul3 3786 3787 store i32 %mad3, ptr addrspace(1) 
%dst, align 4 3788 ret void 3789} 3790 3791define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, 3792; GFX7-LABEL: udot4_bad_source: 3793; GFX7: ; %bb.0: ; %entry 3794; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 3795; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf 3796; GFX7-NEXT: s_mov_b32 s7, 0xf000 3797; GFX7-NEXT: s_mov_b32 s10, 0 3798; GFX7-NEXT: s_mov_b32 s11, s7 3799; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3800; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 3801; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3802; GFX7-NEXT: v_mov_b32_e32 v1, 0 3803; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 3804; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 3805; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 3806; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 3807; GFX7-NEXT: s_and_b32 s1, s12, 0xffff 3808; GFX7-NEXT: s_mov_b32 s6, -1 3809; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3810; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 3811; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3812; GFX7-NEXT: v_mov_b32_e32 v1, s0 3813; GFX7-NEXT: s_waitcnt vmcnt(1) 3814; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 3815; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 3816; GFX7-NEXT: s_waitcnt vmcnt(0) 3817; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 3818; GFX7-NEXT: v_mad_u32_u24 v1, v3, s1, v1 3819; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 3820; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 3821; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1 3822; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 3823; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 3824; GFX7-NEXT: s_endpgm 3825; 3826; GFX8-LABEL: udot4_bad_source: 3827; GFX8: ; %bb.0: ; %entry 3828; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3829; GFX8-NEXT: s_load_dword s6, s[4:5], 0x3c 3830; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3831; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3832; GFX8-NEXT: v_mov_b32_e32 v1, s1 3833; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3834; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3835; GFX8-NEXT: flat_load_dword v3, v[0:1] 3836; GFX8-NEXT: v_mov_b32_e32 v1, s3 3837; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, s2, v2 3838; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3839; GFX8-NEXT: flat_load_dword v0, v[0:1] 3840; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3841; GFX8-NEXT: s_and_b32 s3, s6, 0xffff 3842; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3843; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 3844; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3845; GFX8-NEXT: v_mov_b32_e32 v1, s2 3846; GFX8-NEXT: s_waitcnt vmcnt(1) 3847; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v3 3848; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 3849; GFX8-NEXT: v_mad_u32_u24 v1, v2, s3, v1 3850; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 3851; GFX8-NEXT: s_waitcnt vmcnt(0) 3852; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 3853; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 3854; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 3855; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 3856; GFX8-NEXT: v_mov_b32_e32 v0, s0 3857; GFX8-NEXT: v_mov_b32_e32 v1, s1 3858; GFX8-NEXT: flat_store_dword v[0:1], v2 3859; GFX8-NEXT: s_endpgm 3860; 3861; GFX9-NODL-LABEL: udot4_bad_source: 3862; GFX9-NODL: ; %bb.0: ; %entry 3863; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3864; GFX9-NODL-NEXT: s_load_dword s6, s[4:5], 0x3c 3865; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3866; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3867; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 3868; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 3869; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3870; GFX9-NODL-NEXT: s_and_b32 s3, s6, 0xffff 3871; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3872; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3873; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 3874; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3875; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 3876; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3877; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 3878; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 3879; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3880; 
GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 3881; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s3, v2 3882; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 3883; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 3884; GFX9-NODL-NEXT: s_endpgm 3885; 3886; GFX9-DL-LABEL: udot4_bad_source: 3887; GFX9-DL: ; %bb.0: ; %entry 3888; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3889; GFX9-DL-NEXT: s_load_dword s6, s[4:5], 0x3c 3890; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3891; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3892; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 3893; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 3894; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3895; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 3896; GFX9-DL-NEXT: s_and_b32 s4, s6, 0xffff 3897; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3898; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3899; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 3900; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3901; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 3902; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 3903; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 3904; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3905; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 3906; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3 3907; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 3908; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 3909; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 3910; GFX9-DL-NEXT: s_endpgm 3911; 3912; GFX10-DL-LABEL: udot4_bad_source: 3913; GFX10-DL: ; %bb.0: ; %entry 3914; GFX10-DL-NEXT: s_clause 0x1 3915; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3916; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x3c 3917; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3918; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 3919; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3920; GFX10-DL-NEXT: s_clause 0x1 3921; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 3922; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 3923; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 3924; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3925; GFX10-DL-NEXT: s_and_b32 s3, s6, 
0xffff 3926; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3927; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 3928; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 3929; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 3930; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3931; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 3932; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 3933; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3934; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s3, s2 3935; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0 3936; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] 3937; GFX10-DL-NEXT: s_endpgm 3938; 3939; GFX11-DL-LABEL: udot4_bad_source: 3940; GFX11-DL: ; %bb.0: ; %entry 3941; GFX11-DL-NEXT: s_clause 0x1 3942; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3943; GFX11-DL-NEXT: s_load_b32 s6, s[4:5], 0x3c 3944; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 3945; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3946; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3947; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3948; GFX11-DL-NEXT: s_clause 0x1 3949; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 3950; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 3951; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 3952; GFX11-DL-NEXT: s_and_b32 s3, s6, 0xffff 3953; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3954; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 3955; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 3956; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 3957; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3958; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 3959; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 3960; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3961; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s3, s2 3962; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3963; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 3964; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] 3965; GFX11-DL-NEXT: s_endpgm 3966 ptr addrspace(1) %src2, 3967 ptr addrspace(1) %src3, 3968 i16 %badsource, 3969 ptr addrspace(1) nocapture %dst) { 3970entry: 3971 %idx = call i32 
@llvm.amdgcn.workitem.id.x() 3972 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3973 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3974 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3975 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3976 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 3977 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 3978 3979 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3980 %cv1e0 = zext i8 %v1e0 to i32 3981 %v2e0 = extractelement <4 x i8> %vec2, i64 0 3982 %other = zext i16 %badsource to i32 3983 %mul1 = mul nuw nsw i32 %cv1e0, %other 3984 3985 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3986 %cv1e1 = zext i8 %v1e1 to i32 3987 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3988 %cv2e1 = zext i8 %v2e1 to i32 3989 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 3990 3991 %v2e2 = extractelement <4 x i8> %vec2, i64 2 3992 %cv2e2 = zext i8 %v2e2 to i32 3993 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3994 %cv1e2 = zext i8 %v1e2 to i32 3995 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 3996 3997 3998 %acc = load i32, ptr addrspace(1) %dst, align 4 3999 %mad1 = add i32 %mul1, %acc 4000 %mad2 = add i32 %mad1, %mul2 4001 %mad3 = add i32 %mad2, %mul3 4002 4003 store i32 %mad3, ptr addrspace(1) %dst, align 4 4004 ret void 4005} 4006 4007
; udot4_commutative: loads <4 x i8> vectors from %src1, %src2 and %src3 at the
; workitem index, zero-extends lanes 0, 1 and 2 of %vec1 and %vec2, multiplies
; them lane by lane and adds the three products to the i32 loaded from %dst;
; the sum is stored back to %dst. Lane 3 is not used and %vec3 is loaded but
; never read. For the third product the %vec2 lane is extracted before the
; %vec1 lane, while the multiply still takes %cv1e2 as its first operand.
; The assertions under the -DL check prefixes expect one v_dot4_u32_u8 fed by
; v_perm_b32 with selector 0xc020100; the remaining prefixes expect
; v_mad_u32_u24 / v_mul_u32_u24_sdwa sequences instead.
; NOTE(review): the assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4008define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, 4009; GFX7-LABEL: udot4_commutative: 4010; GFX7: ; %bb.0: ; %entry 4011; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 4012; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf 4013; GFX7-NEXT: s_mov_b32 s3, 0xf000 4014; GFX7-NEXT: s_mov_b32 s6, 0 4015; GFX7-NEXT: s_mov_b32 s7, s3 4016; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4017; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4018; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4019; GFX7-NEXT: v_mov_b32_e32 v1, 0 4020; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 4021; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 4022; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 4023; 
GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 4024; GFX7-NEXT: s_mov_b32 s2, -1 4025; GFX7-NEXT: s_waitcnt vmcnt(1) 4026; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 4027; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 4028; GFX7-NEXT: s_waitcnt vmcnt(0) 4029; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 4030; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 4031; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4032; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4 4033; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 4034; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 4035; GFX7-NEXT: v_mad_u32_u24 v1, v3, v5, v1 4036; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 4037; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 4038; GFX7-NEXT: s_endpgm 4039; 4040; GFX8-LABEL: udot4_commutative: 4041; GFX8: ; %bb.0: ; %entry 4042; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4043; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c 4044; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 4045; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4046; GFX8-NEXT: v_mov_b32_e32 v1, s1 4047; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 4048; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4049; GFX8-NEXT: flat_load_dword v3, v[0:1] 4050; GFX8-NEXT: v_mov_b32_e32 v1, s3 4051; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 4052; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4053; GFX8-NEXT: flat_load_dword v0, v[0:1] 4054; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 4055; GFX8-NEXT: s_waitcnt vmcnt(1) 4056; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 4057; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 4058; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 4059; GFX8-NEXT: s_waitcnt vmcnt(0) 4060; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 4061; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 4062; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4063; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 4064; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 4065; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 4066; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 4067; GFX8-NEXT: v_mov_b32_e32 v0, s4 4068; GFX8-NEXT: v_mov_b32_e32 v1, s5 4069; GFX8-NEXT: flat_store_dword v[0:1], v2 4070; GFX8-NEXT: s_endpgm 4071; 4072; GFX9-NODL-LABEL: 
udot4_commutative: 4073; GFX9-NODL: ; %bb.0: ; %entry 4074; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4075; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4076; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4077; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4078; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 4079; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 4080; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 4081; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 4082; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 4083; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 4084; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 4085; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 4086; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 4087; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 4088; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4089; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 4090; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 4091; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 4092; GFX9-NODL-NEXT: s_endpgm 4093; 4094; GFX9-DL-LABEL: udot4_commutative: 4095; GFX9-DL: ; %bb.0: ; %entry 4096; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4097; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4098; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4099; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4100; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 4101; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 4102; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 4103; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 4104; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 4105; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 4106; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 4107; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 4108; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 4109; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4110; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 4111; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 4112; GFX9-DL-NEXT: s_endpgm 4113; 4114; 
GFX10-DL-LABEL: udot4_commutative: 4115; GFX10-DL: ; %bb.0: ; %entry 4116; GFX10-DL-NEXT: s_clause 0x1 4117; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4118; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 4119; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4120; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4121; GFX10-DL-NEXT: s_clause 0x1 4122; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 4123; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 4124; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 4125; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 4126; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 4127; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 4128; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 4129; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 4130; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 4131; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4132; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 4133; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 4134; GFX10-DL-NEXT: s_endpgm 4135; 4136; GFX11-DL-LABEL: udot4_commutative: 4137; GFX11-DL: ; %bb.0: ; %entry 4138; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4139; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4140; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x3c 4141; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 4142; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 4143; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4144; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4145; GFX11-DL-NEXT: s_clause 0x1 4146; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 4147; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 4148; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 4149; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 4150; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 4151; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 4152; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 4153; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4154; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4155; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 4156; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 4157; GFX11-DL-NEXT: s_endpgm 4158 ptr addrspace(1) 
%src2, 4159 ptr addrspace(1) %src3, 4160 ptr addrspace(1) nocapture %dst) { 4161entry: 4162 %idx = call i32 @llvm.amdgcn.workitem.id.x() 4163 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 4164 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 4165 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 4166 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 4167 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 4168 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 4169 4170 %v1e0 = extractelement <4 x i8> %vec1, i64 0 4171 %cv1e0 = zext i8 %v1e0 to i32 4172 %v2e0 = extractelement <4 x i8> %vec2, i64 0 4173 %cv2e0 = zext i8 %v2e0 to i32 4174 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 4175 4176 %v1e1 = extractelement <4 x i8> %vec1, i64 1 4177 %cv1e1 = zext i8 %v1e1 to i32 4178 %v2e1 = extractelement <4 x i8> %vec2, i64 1 4179 %cv2e1 = zext i8 %v2e1 to i32 4180 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 4181 4182 %v2e2 = extractelement <4 x i8> %vec2, i64 2 4183 %cv2e2 = zext i8 %v2e2 to i32 4184 %v1e2 = extractelement <4 x i8> %vec1, i64 2 4185 %cv1e2 = zext i8 %v1e2 to i32 4186 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 4187 4188 4189 %acc = load i32, ptr addrspace(1) %dst, align 4 4190 %mad1 = add i32 %mul1, %acc 4191 %mad2 = add i32 %mad1, %mul2 4192 %mad3 = add i32 %mad2, %mul3 4193 4194 store i32 %mad3, ptr addrspace(1) %dst, align 4 4195 ret void 4196} 4197
; udot4_acc32_3src_3ele_src0: three <4 x i8> inputs, three products.
;   mul1 = zext(%vec2[1]) * zext(%vec2[1])   (the same value on both sides)
;   mul2 = zext(%vec1[1]) * zext(%vec2[1])
;   mul3 = zext(%vec2[2]) * zext(%vec3[2])
; The products are added to the i32 loaded from %dst and the sum is stored
; back to %dst. The assertions under the -DL check prefixes expect
; v_perm_b32 / v_or_b32 shuffles feeding one v_dot4_u32_u8.
4198define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, 4199; GFX7-LABEL: udot4_acc32_3src_3ele_src0: 4200; GFX7: ; %bb.0: ; %entry 4201; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 4202; GFX7-NEXT: s_mov_b32 s11, 0xf000 4203; GFX7-NEXT: s_mov_b32 s14, 0 4204; GFX7-NEXT: s_mov_b32 s15, s11 4205; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4206; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4207; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1] 4208; GFX7-NEXT: v_mov_b32_e32 v1, 0 4209; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 4210; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3] 
4211; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 4212; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] 4213; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 4214; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0 4215; GFX7-NEXT: s_mov_b32 s10, -1 4216; GFX7-NEXT: s_mov_b32 s8, s6 4217; GFX7-NEXT: s_mov_b32 s9, s7 4218; GFX7-NEXT: s_waitcnt vmcnt(2) 4219; GFX7-NEXT: v_bfe_u32 v1, v2, 8, 8 4220; GFX7-NEXT: s_waitcnt vmcnt(1) 4221; GFX7-NEXT: v_bfe_u32 v2, v3, 8, 8 4222; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4223; GFX7-NEXT: v_mad_u32_u24 v4, v2, v2, s0 4224; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 4225; GFX7-NEXT: s_waitcnt vmcnt(0) 4226; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 4227; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, v4 4228; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 4229; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 4230; GFX7-NEXT: s_endpgm 4231; 4232; GFX8-LABEL: udot4_acc32_3src_3ele_src0: 4233; GFX8: ; %bb.0: ; %entry 4234; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 4235; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 4236; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4237; GFX8-NEXT: v_mov_b32_e32 v1, s1 4238; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 4239; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4240; GFX8-NEXT: flat_load_dword v3, v[0:1] 4241; GFX8-NEXT: v_mov_b32_e32 v1, s3 4242; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 4243; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4244; GFX8-NEXT: flat_load_dword v4, v[0:1] 4245; GFX8-NEXT: v_mov_b32_e32 v1, s5 4246; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 4247; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4248; GFX8-NEXT: flat_load_dword v0, v[0:1] 4249; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 4250; GFX8-NEXT: s_waitcnt vmcnt(2) 4251; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 4252; GFX8-NEXT: s_waitcnt vmcnt(1) 4253; GFX8-NEXT: v_bfe_u32 v1, v4, 8, 8 4254; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 8 4255; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4256; GFX8-NEXT: v_mad_u32_u24 v4, v1, v1, s0 4257; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v4 4258; GFX8-NEXT: 
s_waitcnt vmcnt(0) 4259; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 4260; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 4261; GFX8-NEXT: v_mov_b32_e32 v0, s6 4262; GFX8-NEXT: v_mov_b32_e32 v1, s7 4263; GFX8-NEXT: flat_store_dword v[0:1], v2 4264; GFX8-NEXT: s_endpgm 4265; 4266; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0: 4267; GFX9-NODL: ; %bb.0: ; %entry 4268; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4269; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4270; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4271; GFX9-NODL-NEXT: global_load_dword v1, v0, s[10:11] 4272; GFX9-NODL-NEXT: global_load_dword v2, v0, s[12:13] 4273; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] 4274; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 4275; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 4276; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 4277; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 4278; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 4279; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 4280; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 4281; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 4282; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4283; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0 4284; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 4285; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] 4286; GFX9-NODL-NEXT: s_endpgm 4287; 4288; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0: 4289; GFX9-DL: ; %bb.0: ; %entry 4290; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4291; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4292; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c 4293; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 4294; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 4295; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4296; GFX9-DL-NEXT: global_load_dword v1, v0, s[12:13] 4297; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] 4298; GFX9-DL-NEXT: global_load_dword v3, v0, s[10:11] 4299; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 4300; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 4301; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 4302; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 4303; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 4304; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 4305; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 4306; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 4307; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4308; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3 4309; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] 4310; GFX9-DL-NEXT: s_endpgm 4311; 4312; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0: 4313; GFX10-DL: ; %bb.0: ; %entry 4314; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4315; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4316; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4317; GFX10-DL-NEXT: s_clause 0x2 4318; GFX10-DL-NEXT: global_load_dword v1, v0, s[12:13] 4319; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] 4320; GFX10-DL-NEXT: global_load_dword v3, v0, s[10:11] 4321; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 4322; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 4323; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c 4324; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 4325; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c01 4326; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 4327; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 4328; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020101 4329; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4330; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 4331; GFX10-DL-NEXT: global_store_dword v2, v0, s[14:15] 4332; GFX10-DL-NEXT: s_endpgm 4333; 4334; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0: 4335; GFX11-DL: ; %bb.0: ; %entry 4336; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 4337; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4338; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4339; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4340; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4341; GFX11-DL-NEXT: s_clause 0x2 4342; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] 4343; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 4344; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 4345; 
GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 4346; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 4347; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c 4348; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 4349; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c01 4350; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020101 4351; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 4352; GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v2 4353; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 4354; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4355; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 4356; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] 4357; GFX11-DL-NEXT: s_endpgm 4358 ptr addrspace(1) %src2, 4359 ptr addrspace(1) %src3, 4360 ptr addrspace(1) nocapture %dst) { 4361entry: 4362 %idx = call i32 @llvm.amdgcn.workitem.id.x() 4363 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 4364 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 4365 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 4366 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 4367 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 4368 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 4369 4370 %v2e0 = extractelement <4 x i8> %vec2, i64 1 4371 %cv2e0 = zext i8 %v2e0 to i32 4372 %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0 4373 4374 %v1e1 = extractelement <4 x i8> %vec1, i64 1 4375 %cv1e1 = zext i8 %v1e1 to i32 4376 %v2e1 = extractelement <4 x i8> %vec2, i64 1 4377 %cv2e1 = zext i8 %v2e1 to i32 4378 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 4379 4380 %v3e2 = extractelement <4 x i8> %vec3, i64 2 4381 %cv3e2 = zext i8 %v3e2 to i32 4382 %v2e2 = extractelement <4 x i8> %vec2, i64 2 4383 %cv2e2 = zext i8 %v2e2 to i32 4384 %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2 4385 4386 4387 %acc = load i32, ptr addrspace(1) %dst, align 4 4388 %mad1 = add i32 %mul1, %acc 4389 %mad2 = add i32 %mad1, %mul2 4390 %mad3 = add i32 %mad2, %mul3 4391 4392 store i32 %mad3, ptr addrspace(1) %dst, align 4 4393 ret void 4394} 4395
; udot4_4src: four <4 x i8> inputs (%src1 through %src4), each loaded at the
; workitem index. Every product multiplies lane 0 by lane 1 of the same
; vector, both zero-extended to i32; the four products are added to the i32
; loaded from %dst and the sum is stored back to %dst.
; The assertions under the -DL check prefixes expect the lanes to be gathered
; with v_perm_b32 and v_or_b32 into the two operands of one v_dot4_u32_u8;
; the remaining prefixes expect v_mad_u32_u24 or v_mul_u32_u24_sdwa plus
; v_add3_u32 sequences.
4396define 
amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, 4397; GFX7-LABEL: udot4_4src: 4398; GFX7: ; %bb.0: ; %entry 4399; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 4400; GFX7-NEXT: s_mov_b32 s3, 0xf000 4401; GFX7-NEXT: s_mov_b32 s18, 0 4402; GFX7-NEXT: s_mov_b32 s19, s3 4403; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4404; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4405; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] 4406; GFX7-NEXT: v_mov_b32_e32 v1, 0 4407; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 4408; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] 4409; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 4410; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13] 4411; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 4412; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15] 4413; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 4414; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 4415; GFX7-NEXT: s_mov_b32 s2, -1 4416; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4417; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 4418; GFX7-NEXT: s_waitcnt vmcnt(3) 4419; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 4420; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 4421; GFX7-NEXT: s_waitcnt vmcnt(2) 4422; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3 4423; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 4424; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4425; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4 4426; GFX7-NEXT: s_waitcnt vmcnt(1) 4427; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4 4428; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8 4429; GFX7-NEXT: v_mad_u32_u24 v1, v5, v3, v1 4430; GFX7-NEXT: s_waitcnt vmcnt(0) 4431; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 4432; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 4433; GFX7-NEXT: v_mad_u32_u24 v1, v2, v4, v1 4434; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 4435; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 4436; GFX7-NEXT: s_endpgm 4437; 4438; GFX8-LABEL: udot4_4src: 4439; GFX8: ; %bb.0: ; %entry 4440; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4441; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 4442; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x44 4443; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4444; GFX8-NEXT: v_mov_b32_e32 v1, s9 4445; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 4446; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4447; GFX8-NEXT: flat_load_dword v3, v[0:1] 4448; GFX8-NEXT: v_mov_b32_e32 v1, s11 4449; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 4450; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4451; GFX8-NEXT: flat_load_dword v4, v[0:1] 4452; GFX8-NEXT: v_mov_b32_e32 v1, s13 4453; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2 4454; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4455; GFX8-NEXT: flat_load_dword v5, v[0:1] 4456; GFX8-NEXT: v_mov_b32_e32 v1, s15 4457; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2 4458; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4459; GFX8-NEXT: flat_load_dword v0, v[0:1] 4460; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 4461; GFX8-NEXT: s_waitcnt vmcnt(3) 4462; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 4463; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 4464; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4465; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 4466; GFX8-NEXT: s_waitcnt vmcnt(2) 4467; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 4468; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8 4469; GFX8-NEXT: v_mad_u32_u24 v1, v3, v4, v1 4470; GFX8-NEXT: s_waitcnt vmcnt(1) 4471; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v5 4472; GFX8-NEXT: v_bfe_u32 v5, v5, 8, 8 4473; GFX8-NEXT: v_mad_u32_u24 v1, v6, v5, v1 4474; GFX8-NEXT: s_waitcnt vmcnt(0) 4475; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0 4476; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 4477; GFX8-NEXT: v_mad_u32_u24 v2, v7, v0, v1 4478; GFX8-NEXT: v_mov_b32_e32 v0, s0 4479; GFX8-NEXT: v_mov_b32_e32 v1, s1 4480; GFX8-NEXT: flat_store_dword v[0:1], v2 4481; GFX8-NEXT: s_endpgm 4482; 4483; GFX9-NODL-LABEL: udot4_4src: 4484; GFX9-NODL: ; %bb.0: ; %entry 4485; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4486; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4487; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 4488; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4489; GFX9-NODL-NEXT: 
global_load_dword v1, v0, s[8:9] 4490; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] 4491; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] 4492; GFX9-NODL-NEXT: global_load_dword v4, v0, s[14:15] 4493; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 4494; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 4495; GFX9-NODL-NEXT: s_waitcnt vmcnt(3) 4496; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 4497; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 4498; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 4499; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 4500; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 4501; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 4502; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 4503; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4504; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2 4505; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4 4506; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 4507; GFX9-NODL-NEXT: s_endpgm 4508; 4509; GFX9-DL-LABEL: udot4_4src: 4510; GFX9-DL: ; %bb.0: ; %entry 4511; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4512; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4513; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 4514; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 4515; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c 4516; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4517; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 4518; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 4519; GFX9-DL-NEXT: global_load_dword v3, v0, s[12:13] 4520; GFX9-DL-NEXT: global_load_dword v4, v0, s[14:15] 4521; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400 4522; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 4523; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c 4524; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 4525; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 4526; GFX9-DL-NEXT: v_perm_b32 v5, 
v2, v1, s2 4527; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4 4528; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 4529; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3 4530; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5 4531; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5 4532; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1 4533; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4534; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v3, s6 4535; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 4536; GFX9-DL-NEXT: s_endpgm 4537; 4538; GFX10-DL-LABEL: udot4_4src: 4539; GFX10-DL: ; %bb.0: ; %entry 4540; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 4541; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4542; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 4543; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4544; GFX10-DL-NEXT: s_clause 0x3 4545; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] 4546; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11] 4547; GFX10-DL-NEXT: global_load_dword v3, v0, s[12:13] 4548; GFX10-DL-NEXT: global_load_dword v4, v0, s[14:15] 4549; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 4550; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 4551; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501 4552; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400 4553; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 4554; GFX10-DL-NEXT: v_perm_b32 v5, v4, v3, 0x5010c0c 4555; GFX10-DL-NEXT: v_perm_b32 v2, v4, v3, 0x4000c0c 4556; GFX10-DL-NEXT: v_or_b32_e32 v0, v5, v0 4557; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 4558; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 4559; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4560; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 4561; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] 4562; GFX10-DL-NEXT: s_endpgm 4563; 4564; GFX11-DL-LABEL: udot4_4src: 4565; GFX11-DL: ; %bb.0: ; %entry 4566; GFX11-DL-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 4567; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4568; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 4569; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4570; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 4571; GFX11-DL-NEXT: 
s_waitcnt lgkmcnt(0) 4572; GFX11-DL-NEXT: s_clause 0x3 4573; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9] 4574; GFX11-DL-NEXT: global_load_b32 v2, v0, s[10:11] 4575; GFX11-DL-NEXT: global_load_b32 v3, v0, s[12:13] 4576; GFX11-DL-NEXT: global_load_b32 v0, v0, s[14:15] 4577; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 4578; GFX11-DL-NEXT: s_waitcnt vmcnt(2) 4579; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501 4580; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400 4581; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 4582; GFX11-DL-NEXT: v_perm_b32 v5, v0, v3, 0x5010c0c 4583; GFX11-DL-NEXT: v_perm_b32 v0, v0, v3, 0x4000c0c 4584; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 4585; GFX11-DL-NEXT: v_or_b32_e32 v2, v5, v4 4586; GFX11-DL-NEXT: v_or_b32_e32 v0, v0, v1 4587; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 4588; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4589; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 4590; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s2 4591; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] 4592; GFX11-DL-NEXT: s_endpgm 4593 ptr addrspace(1) %src2, 4594 ptr addrspace(1) %src3, 4595 ptr addrspace(1) %src4, 4596 ptr addrspace(1) nocapture %dst) { 4597entry: 4598 %idx = call i32 @llvm.amdgcn.workitem.id.x() 4599 4600 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 4601 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 4602 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 4603 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 4604 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 4605 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 4606 %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx 4607 %vec4 = load <4 x i8>, ptr addrspace(1) %gep4 4608 4609 4610 %v1e0 = extractelement <4 x i8> %vec1, i64 0 4611 %cv1e0 = zext i8 %v1e0 to i32 4612 %v1e1 = extractelement <4 x i8> %vec1, i64 1 4613 %cv1e1 = zext i8 %v1e1 to i32 4614 %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1 4615 4616 %v2e0 = extractelement 
<4 x i8> %vec2, i64 0 4617 %cv2e0 = zext i8 %v2e0 to i32 4618 %v2e1 = extractelement <4 x i8> %vec2, i64 1 4619 %cv2e1 = zext i8 %v2e1 to i32 4620 %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1 4621 4622 %v3e0 = extractelement <4 x i8> %vec3, i64 0 4623 %cv3e0 = zext i8 %v3e0 to i32 4624 %v3e1 = extractelement <4 x i8> %vec3, i64 1 4625 %cv3e1 = zext i8 %v3e1 to i32 4626 %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1 4627 4628 %v4e0 = extractelement <4 x i8> %vec4, i64 0 4629 %cv4e0 = zext i8 %v4e0 to i32 4630 %v4e1 = extractelement <4 x i8> %vec4, i64 1 4631 %cv4e1 = zext i8 %v4e1 to i32 4632 %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1 4633 4634 4635 %acc = load i32, ptr addrspace(1) %dst, align 4 4636 %mad1 = add i32 %mul1, %acc 4637 %mad2 = add i32 %mad1, %mul2 4638 %mad3 = add i32 %mad2, %mul3 4639 %mad4 = add i32 %mad3, %mul4 4640 4641 store i32 %mad4, ptr addrspace(1) %dst, align 4 4642 ret void 4643} 4644 4645define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, 4646; GFX7-LABEL: udot4_acc32_multi: 4647; GFX7: ; %bb.0: ; %entry 4648; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 4649; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 4650; GFX7-NEXT: s_mov_b32 s3, 0xf000 4651; GFX7-NEXT: s_mov_b32 s6, 0 4652; GFX7-NEXT: s_mov_b32 s7, s3 4653; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4654; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 4655; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 4656; GFX7-NEXT: v_mov_b32_e32 v1, 0 4657; GFX7-NEXT: s_mov_b64 s[8:9], s[10:11] 4658; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] 4659; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 4660; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 4661; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 4662; GFX7-NEXT: s_mov_b32 s2, -1 4663; GFX7-NEXT: s_waitcnt vmcnt(1) 4664; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 4665; GFX7-NEXT: s_waitcnt vmcnt(0) 4666; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 4667; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 8 4668; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 4669; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
4670; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, s4 4671; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 4672; GFX7-NEXT: v_mad_u32_u24 v1, v7, v8, v1 4673; GFX7-NEXT: v_bfe_u32 v11, v3, 16, 8 4674; GFX7-NEXT: v_mad_u32_u24 v1, v9, v4, v1 4675; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 4676; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 4677; GFX7-NEXT: v_mad_u32_u24 v1, v11, v8, v1 4678; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 4679; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 4680; GFX7-NEXT: v_mad_u32_u24 v1, v5, v6, v1 4681; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 4682; GFX7-NEXT: v_mad_u32_u24 v1, v2, v0, v1 4683; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v3 4684; GFX7-NEXT: v_mad_u32_u24 v1, v10, v6, v1 4685; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 4686; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 4687; GFX7-NEXT: s_endpgm 4688; 4689; GFX8-LABEL: udot4_acc32_multi: 4690; GFX8: ; %bb.0: ; %entry 4691; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4692; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 4693; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 4694; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4695; GFX8-NEXT: v_mov_b32_e32 v1, s1 4696; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 4697; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4698; GFX8-NEXT: v_mov_b32_e32 v3, s3 4699; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 4700; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 4701; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 4702; GFX8-NEXT: flat_load_dword v2, v[2:3] 4703; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 4704; GFX8-NEXT: s_waitcnt vmcnt(1) 4705; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 4706; GFX8-NEXT: s_waitcnt vmcnt(0) 4707; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 4708; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 4709; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 4710; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4711; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s0 4712; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v1 4713; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3 4714; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 8 4715; GFX8-NEXT: v_mad_u32_u24 v3, v9, v4, v3 4716; GFX8-NEXT: 
v_bfe_u32 v5, v0, 8, 8 4717; GFX8-NEXT: v_bfe_u32 v6, v2, 8, 8 4718; GFX8-NEXT: v_mad_u32_u24 v3, v11, v8, v3 4719; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 4720; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 4721; GFX8-NEXT: v_mad_u32_u24 v3, v5, v6, v3 4722; GFX8-NEXT: v_bfe_u32 v10, v1, 8, 8 4723; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v3 4724; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 4725; GFX8-NEXT: v_mad_u32_u24 v0, v10, v6, v0 4726; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 4727; GFX8-NEXT: v_mov_b32_e32 v0, s4 4728; GFX8-NEXT: v_mov_b32_e32 v1, s5 4729; GFX8-NEXT: flat_store_dword v[0:1], v2 4730; GFX8-NEXT: s_endpgm 4731; 4732; GFX9-NODL-LABEL: udot4_acc32_multi: 4733; GFX9-NODL: ; %bb.0: ; %entry 4734; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4735; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4736; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 4737; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4738; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 4739; GFX9-NODL-NEXT: global_load_dword v3, v2, s[2:3] 4740; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 4741; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 4742; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 4743; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 4744; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 4745; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 4746; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 4747; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v7, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 4748; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v8, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 4749; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 
src1_sel:BYTE_2 4750; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 4751; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4752; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 4753; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 4754; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 4755; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 4756; GFX9-NODL-NEXT: global_store_dword v2, v0, s[6:7] 4757; GFX9-NODL-NEXT: s_endpgm 4758; 4759; GFX9-DL-LABEL: udot4_acc32_multi: 4760; GFX9-DL: ; %bb.0: ; %entry 4761; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4762; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4763; GFX9-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 4764; GFX9-DL-NEXT: s_mov_b32 s4, 0x3010301 4765; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4766; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 4767; GFX9-DL-NEXT: global_load_dword v3, v2, s[2:3] 4768; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 4769; GFX9-DL-NEXT: s_mov_b32 s0, 0x6040200 4770; GFX9-DL-NEXT: s_mov_b32 s1, 0x2000200 4771; GFX9-DL-NEXT: s_mov_b32 s2, 0x7050301 4772; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 4773; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 4774; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s0 4775; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 4776; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s1 4777; GFX9-DL-NEXT: v_perm_b32 v0, v1, v0, s2 4778; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4779; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v4, v5, s3 4780; GFX9-DL-NEXT: v_perm_b32 v3, v3, v3, s4 4781; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v3, v1 4782; GFX9-DL-NEXT: global_store_dword v2, v0, s[6:7] 4783; GFX9-DL-NEXT: s_endpgm 4784; 4785; GFX10-DL-LABEL: udot4_acc32_multi: 4786; GFX10-DL: ; %bb.0: ; %entry 4787; GFX10-DL-NEXT: s_clause 0x1 4788; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4789; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4790; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 4791; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4792; GFX10-DL-NEXT: s_clause 0x1 4793; GFX10-DL-NEXT: global_load_dwordx2 
v[0:1], v2, s[0:1] 4794; GFX10-DL-NEXT: global_load_dword v3, v2, s[2:3] 4795; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 4796; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 4797; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 4798; GFX10-DL-NEXT: v_perm_b32 v2, v1, v0, 0x6040200 4799; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 4800; GFX10-DL-NEXT: v_perm_b32 v4, v3, v3, 0x2000200 4801; GFX10-DL-NEXT: v_perm_b32 v0, v1, v0, 0x7050301 4802; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 4803; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s0 4804; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0x3010301 4805; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 4806; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1 4807; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 4808; GFX10-DL-NEXT: s_endpgm 4809; 4810; GFX11-DL-LABEL: udot4_acc32_multi: 4811; GFX11-DL: ; %bb.0: ; %entry 4812; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 4813; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4814; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 4815; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4816; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 4817; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4818; GFX11-DL-NEXT: s_clause 0x1 4819; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[0:1] 4820; GFX11-DL-NEXT: global_load_b32 v2, v2, s[2:3] 4821; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 4822; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 4823; GFX11-DL-NEXT: v_perm_b32 v3, v1, v0, 0x6040200 4824; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 4825; GFX11-DL-NEXT: v_perm_b32 v4, v2, v2, 0x2000200 4826; GFX11-DL-NEXT: v_perm_b32 v0, v1, v0, 0x7050301 4827; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0x3010301 4828; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 4829; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4830; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s0 4831; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 4832; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1 4833; GFX11-DL-NEXT: global_store_b32 v3, v0, s[4:5] 4834; GFX11-DL-NEXT: s_endpgm 4835 ptr addrspace(1) %src2, 4836 ptr 
addrspace(1) nocapture %dst) { 4837entry: 4838 %idx = call i32 @llvm.amdgcn.workitem.id.x() 4839 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx 4840 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 4841 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx 4842 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 4843 4844 %v1e0 = extractelement <8 x i8> %vec1, i64 0 4845 %cv1e0 = zext i8 %v1e0 to i32 4846 %v2e0 = extractelement <8 x i8> %vec2, i64 0 4847 %cv2e0 = zext i8 %v2e0 to i32 4848 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 4849 4850 %v1e1 = extractelement <8 x i8> %vec1, i64 1 4851 %cv1e1 = zext i8 %v1e1 to i32 4852 %v2e1 = extractelement <8 x i8> %vec2, i64 1 4853 %cv2e1 = zext i8 %v2e1 to i32 4854 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 4855 4856 %v1e2 = extractelement <8 x i8> %vec1, i64 2 4857 %cv1e2 = zext i8 %v1e2 to i32 4858 %v2e2 = extractelement <8 x i8> %vec2, i64 2 4859 %cv2e2 = zext i8 %v2e2 to i32 4860 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 4861 4862 %v1e3 = extractelement <8 x i8> %vec1, i64 3 4863 %cv1e3 = zext i8 %v1e3 to i32 4864 %v2e3 = extractelement <8 x i8> %vec2, i64 3 4865 %cv2e3 = zext i8 %v2e3 to i32 4866 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 4867 4868 %v1e4 = extractelement <8 x i8> %vec1, i64 4 4869 %cv1e4 = zext i8 %v1e4 to i32 4870 %v2e4 = extractelement <8 x i8> %vec2, i64 4 4871 %cv2e4 = zext i8 %v2e4 to i32 4872 %mul5 = mul nuw nsw i32 %cv1e4, %cv2e0 4873 4874 %v1e5 = extractelement <8 x i8> %vec1, i64 5 4875 %cv1e5 = zext i8 %v1e5 to i32 4876 %v2e5 = extractelement <8 x i8> %vec2, i64 5 4877 %cv2e5 = zext i8 %v2e5 to i32 4878 %mul6 = mul nuw nsw i32 %cv1e5, %cv2e1 4879 4880 %v1e6 = extractelement <8 x i8> %vec1, i64 6 4881 %cv1e6 = zext i8 %v1e6 to i32 4882 %v2e6 = extractelement <8 x i8> %vec2, i64 6 4883 %cv2e6 = zext i8 %v2e6 to i32 4884 %mul7 = mul nuw nsw i32 %cv1e6, %cv2e2 4885 4886 %v1e7 = extractelement <8 x i8> %vec1, i64 7 4887 %cv1e7 = zext i8 %v1e7 to i32 4888 %v2e7 = extractelement <8 x i8> 
%vec2, i64 7 4889 %cv2e7 = zext i8 %v2e7 to i32 4890 %mul8 = mul nuw nsw i32 %cv1e7, %cv2e3 4891 4892 %acc = load i32, ptr addrspace(1) %dst, align 4 4893 %mad11 = add i32 %mul1, %acc 4894 %mad21 = add i32 %mad11, %mul3 4895 %mad31 = add i32 %mad21, %mul5 4896 %mad41 = add i32 %mad31, %mul7 4897 %mad12 = add i32 %mul2, %mad41 4898 %mad22 = add i32 %mad12, %mul4 4899 %mad32 = add i32 %mad22, %mul6 4900 %mad42 = add i32 %mad32, %mul8 4901 4902 store i32 %mad42, ptr addrspace(1) %dst, align 4 4903 ret void 4904} 4905
; Test idot4_acc32_hilo - four-element byte dot product taking the HIGH four
; bytes of %vec1 and the LOW four bytes of %vec2, in matching order:
;   vec1[4]*vec2[0] + vec1[5]*vec2[1] + vec1[6]*vec2[2] + vec1[7]*vec2[3]
;   * Both operands are loaded as <8 x i8>, indexed by workitem.id.x.
;   * Every byte is zero-extended (zext) even though the function is named
;     idot4; the products are therefore unsigned.
;   * The add chain starts from the constant 0 (add i32 %mul1, 0); %dst is
;     only stored to and is never loaded as an accumulator.
; In the checks for the dot-instruction targets (gfx906 and newer in the RUN
; lines) this shows up as a single v_dot4_u32_u8 with a literal 0 accumulator
; and no v_perm_b32 in front of it.
4906define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, 4907; GFX7-LABEL: idot4_acc32_hilo: 4908; GFX7: ; %bb.0: ; %entry 4909; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 4910; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 4911; GFX7-NEXT: s_mov_b32 s3, 0xf000 4912; GFX7-NEXT: s_mov_b32 s6, 0 4913; GFX7-NEXT: s_mov_b32 s7, s3 4914; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 4915; GFX7-NEXT: v_mov_b32_e32 v1, 0 4916; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4917; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 4918; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] 4919; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 4920; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 4921; GFX7-NEXT: s_mov_b32 s2, -1 4922; GFX7-NEXT: s_waitcnt vmcnt(1) 4923; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 4924; GFX7-NEXT: s_waitcnt vmcnt(0) 4925; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 4926; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 4927; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 4928; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5 4929; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 8 4930; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 4931; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4 4932; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 4933; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 4934; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1 4935; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 4936; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 4937; GFX7-NEXT: s_endpgm 4938; 4939; GFX8-LABEL: idot4_acc32_hilo: 4940; GFX8: ; %bb.0: ; %entry 
4941; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4942; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 4943; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 4944; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4945; GFX8-NEXT: v_mov_b32_e32 v1, s1 4946; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4947; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 4948; GFX8-NEXT: v_mov_b32_e32 v1, s3 4949; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 4950; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4951; GFX8-NEXT: flat_load_dword v4, v[0:1] 4952; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 4953; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 4954; GFX8-NEXT: flat_load_dword v2, v[0:1] 4955; GFX8-NEXT: v_mov_b32_e32 v0, s4 4956; GFX8-NEXT: v_mov_b32_e32 v1, s5 4957; GFX8-NEXT: s_waitcnt vmcnt(1) 4958; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 4959; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 4960; GFX8-NEXT: s_waitcnt vmcnt(0) 4961; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2 4962; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 4963; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 4964; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7 4965; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 4966; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 4967; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3 4968; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3 4969; GFX8-NEXT: flat_store_dword v[0:1], v2 4970; GFX8-NEXT: s_endpgm 4971; 4972; GFX9-NODL-LABEL: idot4_acc32_hilo: 4973; GFX9-NODL: ; %bb.0: ; %entry 4974; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4975; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4976; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 4977; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 4978; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 4979; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 4980; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 4981; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 4982; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 4983; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 4984; GFX9-NODL-NEXT: 
v_and_b32_e32 v4, 0xff, v2 4985; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 4986; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 4987; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 4988; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 4989; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 4990; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 4991; GFX9-NODL-NEXT: s_endpgm 4992; 4993; GFX9-DL-LABEL: idot4_acc32_hilo: 4994; GFX9-DL: ; %bb.0: ; %entry 4995; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4996; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4997; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 4998; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 4999; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 5000; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 5001; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 5002; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 5003; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 5004; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 5005; GFX9-DL-NEXT: s_endpgm 5006; 5007; GFX10-DL-LABEL: idot4_acc32_hilo: 5008; GFX10-DL: ; %bb.0: ; %entry 5009; GFX10-DL-NEXT: s_clause 0x1 5010; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5011; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5012; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5013; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 5014; GFX10-DL-NEXT: s_clause 0x1 5015; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 5016; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 5017; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 5018; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 5019; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 5020; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 5021; GFX10-DL-NEXT: s_endpgm 5022; 5023; GFX11-DL-LABEL: idot4_acc32_hilo: 5024; GFX11-DL: ; %bb.0: ; %entry 5025; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 
5026; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 5027; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 5028; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 5029; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 5030; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5031; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 5032; GFX11-DL-NEXT: s_clause 0x1 5033; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] offset:4 5034; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 5035; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 5036; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 5037; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 5038; GFX11-DL-NEXT: s_endpgm 5039 ptr addrspace(1) %src2, 5040 ptr addrspace(1) nocapture %dst) { 5041entry: 5042 %idx = call i32 @llvm.amdgcn.workitem.id.x() 5043 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx 5044 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 5045 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx 5046 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 5047 5048 %v1e0 = extractelement <8 x i8> %vec1, i64 4 5049 %cv1e0 = zext i8 %v1e0 to i32 5050 %v2e0 = extractelement <8 x i8> %vec2, i64 0 5051 %cv2e0 = zext i8 %v2e0 to i32 5052 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 5053 5054 %v1e1 = extractelement <8 x i8> %vec1, i64 5 5055 %cv1e1 = zext i8 %v1e1 to i32 5056 %v2e1 = extractelement <8 x i8> %vec2, i64 1 5057 %cv2e1 = zext i8 %v2e1 to i32 5058 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 5059 5060 %v1e2 = extractelement <8 x i8> %vec1, i64 6 5061 %cv1e2 = zext i8 %v1e2 to i32 5062 %v2e2 = extractelement <8 x i8> %vec2, i64 2 5063 %cv2e2 = zext i8 %v2e2 to i32 5064 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 5065 5066 %v1e3 = extractelement <8 x i8> %vec1, i64 7 5067 %cv1e3 = zext i8 %v1e3 to i32 5068 %v2e3 = extractelement <8 x i8> %vec2, i64 3 5069 %cv2e3 = zext i8 %v2e3 to i32 5070 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 5071 5072 %add1 = add i32 %mul1, 0 5073 %add2 = add i32 %add1, %mul2 5074 %add3 = add i32 %add2, %mul3 5075 %add4 = add i32 %add3, %mul4 5076 
store i32 %add4, ptr addrspace(1) %dst, align 4 5077 ret void 5078} 5079
; Test idot4_acc32_lohi - four-element byte dot product taking the LOW four
; bytes of %vec1 and the HIGH four bytes of %vec2 in reversed order:
;   vec1[0]*vec2[7] + vec1[1]*vec2[6] + vec1[2]*vec2[5] + vec1[3]*vec2[4]
;   * Both operands are loaded as <8 x i8>, indexed by workitem.id.x.
;   * Every byte is zero-extended (zext) even though the function is named
;     idot4; the products are therefore unsigned.
;   * The add chain starts from the constant 0 (add i32 %mul1, 0); %dst is
;     only stored to and is never loaded as an accumulator.
; In the checks for the dot-instruction targets (gfx906 and newer in the RUN
; lines) both operands go through a v_perm_b32 before the single
; v_dot4_u32_u8, because the byte pairing is not lane-for-lane.
5080define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, 5081; GFX7-LABEL: idot4_acc32_lohi: 5082; GFX7: ; %bb.0: ; %entry 5083; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 5084; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 5085; GFX7-NEXT: s_mov_b32 s3, 0xf000 5086; GFX7-NEXT: s_mov_b32 s6, 0 5087; GFX7-NEXT: s_mov_b32 s7, s3 5088; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5089; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 5090; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5091; GFX7-NEXT: v_mov_b32_e32 v1, 0 5092; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 5093; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 5094; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 5095; GFX7-NEXT: s_mov_b32 s2, -1 5096; GFX7-NEXT: s_waitcnt vmcnt(1) 5097; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 5098; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 5099; GFX7-NEXT: s_waitcnt vmcnt(0) 5100; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8 5101; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 5102; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6 5103; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 5104; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 5105; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 5106; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 5107; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 5108; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 5109; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 5110; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 5111; GFX7-NEXT: s_endpgm 5112; 5113; GFX8-LABEL: idot4_acc32_lohi: 5114; GFX8: ; %bb.0: ; %entry 5115; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5116; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 5117; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 5118; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5119; GFX8-NEXT: v_mov_b32_e32 v1, s1 5120; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 5121; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5122; GFX8-NEXT: v_mov_b32_e32 v3, s3 5123; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 5124; GFX8-NEXT: v_addc_u32_e32 v3, 
vcc, 0, v3, vcc 5125; GFX8-NEXT: flat_load_dword v4, v[0:1] 5126; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 5127; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 5128; GFX8-NEXT: flat_load_dword v2, v[0:1] 5129; GFX8-NEXT: v_mov_b32_e32 v0, s4 5130; GFX8-NEXT: v_mov_b32_e32 v1, s5 5131; GFX8-NEXT: s_waitcnt vmcnt(1) 5132; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 5133; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 5134; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v4 5135; GFX8-NEXT: s_waitcnt vmcnt(0) 5136; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 5137; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 5138; GFX8-NEXT: v_bfe_u32 v8, v2, 8, 8 5139; GFX8-NEXT: v_mad_u32_u24 v3, v3, v7, v4 5140; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 5141; GFX8-NEXT: v_mad_u32_u24 v3, v5, v8, v3 5142; GFX8-NEXT: v_mad_u32_u24 v2, v6, v2, v3 5143; GFX8-NEXT: flat_store_dword v[0:1], v2 5144; GFX8-NEXT: s_endpgm 5145; 5146; GFX9-NODL-LABEL: idot4_acc32_lohi: 5147; GFX9-NODL: ; %bb.0: ; %entry 5148; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5149; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5150; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5151; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 5152; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 5153; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 5154; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 5155; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 5156; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 5157; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 5158; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 5159; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 5160; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 5161; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0 5162; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 5163; 
GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 5164; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 5165; GFX9-NODL-NEXT: s_endpgm 5166; 5167; GFX9-DL-LABEL: idot4_acc32_lohi: 5168; GFX9-DL: ; %bb.0: ; %entry 5169; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5170; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5171; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5172; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 5173; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 5174; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 5175; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302 5176; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 5177; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 5178; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 5179; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 5180; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 5181; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 5182; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 5183; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 5184; GFX9-DL-NEXT: s_endpgm 5185; 5186; GFX10-DL-LABEL: idot4_acc32_lohi: 5187; GFX10-DL: ; %bb.0: ; %entry 5188; GFX10-DL-NEXT: s_clause 0x1 5189; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5190; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5191; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5192; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 5193; GFX10-DL-NEXT: s_clause 0x1 5194; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 5195; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 5196; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 5197; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x10302 5198; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 5199; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001 5200; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 5201; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 5202; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 5203; GFX10-DL-NEXT: s_endpgm 5204; 5205; GFX11-DL-LABEL: idot4_acc32_lohi: 5206; GFX11-DL: ; %bb.0: ; %entry 5207; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5208; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 5209; GFX11-DL-NEXT: s_load_b64 
s[4:5], s[4:5], 0x34 5210; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 5211; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 5212; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5213; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 5214; GFX11-DL-NEXT: s_clause 0x1 5215; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] offset:4 5216; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 5217; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 5218; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x10302 5219; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 5220; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 5221; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 5222; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 5223; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 5224; GFX11-DL-NEXT: s_endpgm 5225 ptr addrspace(1) %src2, 5226 ptr addrspace(1) nocapture %dst) { 5227entry: 5228 %idx = call i32 @llvm.amdgcn.workitem.id.x() 5229 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx 5230 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 5231 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx 5232 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 5233 5234 %v1e0 = extractelement <8 x i8> %vec1, i64 0 5235 %cv1e0 = zext i8 %v1e0 to i32 5236 %v2e0 = extractelement <8 x i8> %vec2, i64 7 5237 %cv2e0 = zext i8 %v2e0 to i32 5238 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 5239 5240 %v1e1 = extractelement <8 x i8> %vec1, i64 1 5241 %cv1e1 = zext i8 %v1e1 to i32 5242 %v2e1 = extractelement <8 x i8> %vec2, i64 6 5243 %cv2e1 = zext i8 %v2e1 to i32 5244 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 5245 5246 %v1e2 = extractelement <8 x i8> %vec1, i64 2 5247 %cv1e2 = zext i8 %v1e2 to i32 5248 %v2e2 = extractelement <8 x i8> %vec2, i64 5 5249 %cv2e2 = zext i8 %v2e2 to i32 5250 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 5251 5252 %v1e3 = extractelement <8 x i8> %vec1, i64 3 5253 %cv1e3 = zext i8 %v1e3 to i32 5254 %v2e3 = extractelement <8 x i8> %vec2, i64 4 5255 %cv2e3 = zext i8 %v2e3 to i32 5256 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 5257 5258 %add1 = add i32 
%mul1, 0 5259 %add2 = add i32 %add1, %mul2 5260 %add3 = add i32 %add2, %mul3 5261 %add4 = add i32 %add3, %mul4 5262 store i32 %add4, ptr addrspace(1) %dst, align 4 5263 ret void 5264} 5265
; Test idot4_acc32_hihi - four-element byte dot product in which BOTH
; operands come from the high four bytes of their <8 x i8> vector, with a
; shuffled pairing:
;   vec1[4]*vec2[6] + vec1[6]*vec2[4] + vec1[5]*vec2[7] + vec1[7]*vec2[5]
;   * Both operands are loaded as <8 x i8>, indexed by workitem.id.x.
;   * Every byte is zero-extended (zext) even though the function is named
;     idot4; the products are therefore unsigned.
;   * The add chain starts from the constant 0 (add i32 %mul1, 0); %dst is
;     only stored to and is never loaded as an accumulator.
; In the checks for the dot-instruction targets (gfx906 and newer in the RUN
; lines) both dwords are loaded with offset 4 and both go through a
; v_perm_b32 before the single v_dot4_u32_u8.
5266define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, 5267; GFX7-LABEL: idot4_acc32_hihi: 5268; GFX7: ; %bb.0: ; %entry 5269; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 5270; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 5271; GFX7-NEXT: s_mov_b32 s3, 0xf000 5272; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5273; GFX7-NEXT: v_mov_b32_e32 v1, 0 5274; GFX7-NEXT: s_mov_b32 s6, 0 5275; GFX7-NEXT: s_mov_b32 s7, s3 5276; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5277; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 5278; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4 5279; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 5280; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 5281; GFX7-NEXT: s_mov_b32 s2, -1 5282; GFX7-NEXT: s_waitcnt vmcnt(1) 5283; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8 5284; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 5285; GFX7-NEXT: s_waitcnt vmcnt(0) 5286; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 5287; GFX7-NEXT: v_bfe_u32 v5, v0, 16, 8 5288; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6 5289; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 5290; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 5291; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 5292; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 5293; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 5294; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 5295; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 5296; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 5297; GFX7-NEXT: s_endpgm 5298; 5299; GFX8-LABEL: idot4_acc32_hihi: 5300; GFX8: ; %bb.0: ; %entry 5301; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5302; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 5303; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5304; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5305; GFX8-NEXT: v_mov_b32_e32 v1, s1 5306; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 5307; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
5308; GFX8-NEXT: v_mov_b32_e32 v3, s3 5309; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0 5310; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 5311; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 5312; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5313; GFX8-NEXT: flat_load_dword v2, v[0:1] 5314; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 5315; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 5316; GFX8-NEXT: flat_load_dword v3, v[0:1] 5317; GFX8-NEXT: v_mov_b32_e32 v0, s4 5318; GFX8-NEXT: v_mov_b32_e32 v1, s5 5319; GFX8-NEXT: s_waitcnt vmcnt(1) 5320; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 5321; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8 5322; GFX8-NEXT: s_waitcnt vmcnt(0) 5323; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 5324; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 5325; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 5326; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 5327; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 5328; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 5329; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 5330; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 5331; GFX8-NEXT: flat_store_dword v[0:1], v2 5332; GFX8-NEXT: s_endpgm 5333; 5334; GFX9-NODL-LABEL: idot4_acc32_hihi: 5335; GFX9-NODL: ; %bb.0: ; %entry 5336; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5337; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5338; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5339; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 5340; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 5341; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 5342; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 5343; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 5344; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 5345; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 5346; GFX9-NODL-NEXT: v_bfe_u32 v4, v2, 16, 8 5347; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 5348; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 src1_sel:BYTE_3 5349; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 5350; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 5351; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 5352; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 5353; GFX9-NODL-NEXT: s_endpgm 5354; 5355; GFX9-DL-LABEL: idot4_acc32_hihi: 5356; GFX9-DL: ; %bb.0: ; %entry 5357; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5358; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5359; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5360; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 5361; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 5362; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4 5363; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200 5364; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002 5365; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 5366; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 5367; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 5368; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 5369; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 5370; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 5371; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 5372; GFX9-DL-NEXT: s_endpgm 5373; 5374; GFX10-DL-LABEL: idot4_acc32_hihi: 5375; GFX10-DL: ; %bb.0: ; %entry 5376; GFX10-DL-NEXT: s_clause 0x1 5377; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5378; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5379; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5380; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 5381; GFX10-DL-NEXT: s_clause 0x1 5382; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 5383; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4 5384; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 5385; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x1030200 5386; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 5387; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002 5388; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 5389; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 5390; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 5391; 
GFX10-DL-NEXT: s_endpgm 5392; 5393; GFX11-DL-LABEL: idot4_acc32_hihi: 5394; GFX11-DL: ; %bb.0: ; %entry 5395; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 5396; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 5397; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 5398; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 5399; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 5400; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 5401; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 5402; GFX11-DL-NEXT: s_clause 0x1 5403; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] offset:4 5404; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:4 5405; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 5406; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x1030200 5407; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 5408; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002 5409; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 5410; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 5411; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 5412; GFX11-DL-NEXT: s_endpgm 5413 ptr addrspace(1) %src2, 5414 ptr addrspace(1) nocapture %dst) { 5415entry: 5416 %idx = call i32 @llvm.amdgcn.workitem.id.x() 5417 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx 5418 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 5419 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx 5420 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 5421 5422 %v1e0 = extractelement <8 x i8> %vec1, i64 4 5423 %cv1e0 = zext i8 %v1e0 to i32 5424 %v2e0 = extractelement <8 x i8> %vec2, i64 6 5425 %cv2e0 = zext i8 %v2e0 to i32 5426 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 5427 5428 %v1e1 = extractelement <8 x i8> %vec1, i64 6 5429 %cv1e1 = zext i8 %v1e1 to i32 5430 %v2e1 = extractelement <8 x i8> %vec2, i64 4 5431 %cv2e1 = zext i8 %v2e1 to i32 5432 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 5433 5434 %v1e2 = extractelement <8 x i8> %vec1, i64 5 5435 %cv1e2 = zext i8 %v1e2 to i32 5436 %v2e2 = extractelement <8 x i8> %vec2, i64 7 5437 %cv2e2 = zext i8 %v2e2 to i32 5438 %mul3 = mul nuw nsw i32 %cv1e2, 
%cv2e2

  %v1e3 = extractelement <8 x i8> %vec1, i64 7
  %cv1e3 = zext i8 %v1e3 to i32
  %v2e3 = extractelement <8 x i8> %vec2, i64 5
  %cv2e3 = zext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  %add1 = add i32 %mul1, 0
  %add2 = add i32 %add1, %mul2
  %add3 = add i32 %add2, %mul3
  %add4 = add i32 %add3, %mul4
  store i32 %add4, ptr addrspace(1) %dst, align 4
  ret void
}

; Both operands of the 4-term product come from a single <8 x i8> load of
; %src1: bytes 0-3 are multiplied with bytes 4-7 (all zero-extended) and the
; products are summed starting from a constant 0. %src2 is part of the
; signature but is never read in this kernel. The gfx906, gfx1011/gfx1012 and
; gfx1100 assertions expect one v_dot4_u32_u8 fed directly by the two loaded
; dwords; the gfx700, gfx803 and gfx900 assertions expect 24-bit mul/mad code.
define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_v8i8:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, s7
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8
; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_v8i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 8
; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_v8i8:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v3, v4, v5
; GFX9-NODL-NEXT: v_add3_u32 v0, v1, v6, v0
; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_v8i8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
; GFX9-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_v8i8:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v8i8:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DL-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[0:1]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_endpgm
                                            ptr addrspace(1) %src2,
                                            ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1


  %v1e0 = extractelement <8 x i8> %vec1, i64 0
  %cv1e0 = zext i8 %v1e0 to i32
  %v2e0 = extractelement <8 x i8> %vec1, i64 4
  %cv2e0 = zext i8 %v2e0 to i32
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i8> %vec1, i64 1
  %cv1e1 = zext i8 %v1e1 to i32
  %v2e1 = extractelement <8 x i8> %vec1, i64 5
  %cv2e1 = zext i8 %v2e1 to i32
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i8> %vec1, i64 2
  %cv1e2 = zext i8 %v1e2 to i32
  %v2e2 = extractelement <8 x i8> %vec1, i64 6
  %cv2e2 = zext i8 %v2e2 to i32
  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i8> %vec1, i64 3
  %cv1e3 = zext i8 %v1e3 to i32
  %v2e3 = extractelement <8 x i8> %vec1, i64 7
  %cv2e3 = zext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  %add1 = add i32 %mul1, 0
  %add2 = add i32 %add1, %mul2
  %add3 = add i32 %add2, %mul3
  %add4 = add i32 %add3, %mul4
  store i32 %add4, ptr addrspace(1) %dst, align 4
  ret void
}

; Mixed-width operands: bytes 8, 10, 13 and 15 of a <16 x i8> load from %src1
; are paired with bytes 0-3 of an <8 x i8> load from %src2 (all zero-extended,
; summed starting from a constant 0). The selected %vec1 bytes live in two
; different dwords of the load, and the gfx906, gfx1011/gfx1012 and gfx1100
; assertions expect one v_perm_b32 per operand ahead of the v_dot4_u32_u8.
define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_v16i8:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b64 s[8:9], s[10:11]
; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v2
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
; GFX7-NEXT: v_mul_u32_u24_e32 v2, v2, v5
; GFX7-NEXT: v_bfe_u32 v6, v3, 8, 8
; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_v16i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2]
; GFX8-NEXT: flat_load_dword v4, v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 8
; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_v16i8:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX9-NODL-NEXT: ; kill: killed $vgpr4
; GFX9-NODL-NEXT: ; kill: killed $vgpr5
; GFX9-NODL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v0, v5, s[2:3]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0
; GFX9-NODL-NEXT: global_store_dword v1, v0, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_v16i8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX9-DL-NEXT: ; kill: killed $vgpr4
; GFX9-DL-NEXT: ; kill: killed $vgpr5
; GFX9-DL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX9-DL-NEXT: global_load_dword v0, v5, s[2:3]
; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002
; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1
; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0
; GFX9-DL-NEXT: global_store_dword v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_v16i8:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX10-DL-NEXT: ; kill: killed $vgpr4
; GFX10-DL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
; GFX10-DL-NEXT: ; kill: killed $vgpr5
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX10-DL-NEXT: global_load_dword v0, v5, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v16i8:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[0:1]
; GFX11-DL-NEXT: global_load_b32 v0, v4, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
                                             ptr addrspace(1) %src2,
                                             ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2

  %v1e0 = extractelement <16 x i8> %vec1, i64 8
  %cv1e0 = zext i8 %v1e0 to i32
  %v2e0 = extractelement <8 x i8> %vec2, i64 0
  %cv2e0 = zext i8 %v2e0 to i32
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <16 x i8> %vec1, i64 10
  %cv1e1 = zext i8 %v1e1 to i32
  %v2e1 = extractelement <8 x i8> %vec2, i64 1
  %cv2e1 = zext i8 %v2e1 to i32
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <16 x i8> %vec1, i64 13
  %cv1e2 = zext i8 %v1e2 to i32
  %v2e2 = extractelement <8 x i8> %vec2, i64 2
  %cv2e2 = zext i8 %v2e2 to i32
  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <16 x i8> %vec1, i64 15
  %cv1e3 = zext i8 %v1e3 to i32
  %v2e3 = extractelement <8 x i8> %vec2, i64 3
  %cv2e3 = zext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  %add1 = add i32 %mul1, 0
  %add2 = add i32 %add1, %mul2
  %add3 = add i32 %add2, %mul3
  %add4 = add i32 %add3, %mul4
  store i32 %add4, ptr addrspace(1) %dst, align 4
  ret void
}

; Operand 1 is the last four bytes of a <256 x i8> load from %src1, taken in
; the order 255, 254, 252, 253; operand 2 is bytes 0-3 of an <8 x i8> load from
; %src2 (all zero-extended, summed starting from a constant 0). Every target's
; assertions expect the wide load to be narrowed to one dword at byte offset
; 252 (0xfc); the gfx906, gfx1011/gfx1012 and gfx1100 assertions then expect a
; v_perm_b32 per operand followed by v_dot4_u32_u8.
define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_v256i8:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, s3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[8:11], 0 addr64 offset:252
; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_u32 v4, v0, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4
; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_v256i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xfc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2
; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7
; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4
; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_v256i8:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v2, v1, s[0:1] offset:252
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v3
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_v256i8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: global_load_dword v3, v1, s[0:1] offset:252
; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001
; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_v256i8:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dword v2, v1, s[2:3]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] offset:252
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v2, 0x3020001
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v256i8:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b32 v1, v1, s[2:3]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:252
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x3020001
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <256 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2

  %v1e0 = extractelement <256 x i8> %vec1, i64 255
  %cv1e0 = zext i8 %v1e0 to i32
  %v2e0 = extractelement <8 x i8> %vec2, i64 0
  %cv2e0 = zext i8 %v2e0 to i32
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <256 x i8> %vec1, i64 254
  %cv1e1 = zext i8 %v1e1 to i32
  %v2e1 = extractelement <8 x i8> %vec2, i64 1
  %cv2e1 = zext i8 %v2e1 to i32
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <256 x i8> %vec1, i64 252
  %cv1e2 = zext i8 %v1e2 to i32
  %v2e2 = extractelement <8 x i8> %vec2, i64 2
  %cv2e2 = zext i8 %v2e2 to i32
  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <256 x i8> %vec1, i64 253
  %cv1e3 = zext i8 %v1e3 to i32
  %v2e3 = extractelement <8 x i8> %vec2, i64 3
  %cv2e3 = zext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  %add1 = add i32 %mul1, 0
  %add2 = add i32 %add1, %mul2
  %add3 = add i32 %add2, %mul3
  %add4 = add i32 %add3, %mul4
  store i32 %add4, ptr addrspace(1) %dst, align 4
  ret void
}

; Two-term variant on <4 x i8> inputs that accumulates into the i32 already
; stored at %dst. Byte 0 of %vec1 and byte 1 of %vec2 are widened with
; 'sext' followed by 'and ..., 255' instead of a plain zext; byte 1 of %vec1
; uses zext. The gfx906, gfx1011/gfx1012 and gfx1100 assertions still expect
; v_dot4_u32_u8 with the loaded accumulator, with both operands built by
; v_perm_b32 using selectors 0xc0c0100 and 0xc0c0500 (the 0x0c selector bytes
; presumably zero the two unused lanes; confirm against the ISA manual).
define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX7-LABEL: idot4_acc32_anyext:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc32_anyext:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_anyext:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_anyext:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_anyext:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_anyext:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2

  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0t = sext i8 %v1e0 to i32
  %cv1e0 = and i32 %cv1e0t, 255
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0t = sext i8 %v2e0 to i32
  ; NOTE(review): the mask below reads %cv1e0t, not %cv2e0t, so %cv2e0t is
  ; dead and %mul1 is byte 0 of %vec1 multiplied by itself. The assertions
  ; match the IR as written (e.g. "v_mad_u32_u24 v1, v1, v1, s0" for gfx700).
  ; Possibly a copy-paste slip - confirm intent; any change here requires
  ; regenerating the assertions with utils/update_llc_test_checks.py.
  %cv2e0 = and i32 %cv1e0t, 255
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = zext i8 %v1e1 to i32
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1t = sext i8 %v2e1 to i32
  %cv2e1 = and i32 %cv2e1t, 255
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  ; The accumulator is read from %dst and the sum is written back to %dst.
  %acc = load i32, ptr addrspace(1) %dst, align 4
  %add1 = add i32 %mul1, %acc
  %add2 = add i32 %add1, %mul2
  store i32 %add2, ptr addrspace(1) %dst, align 4
  ret void
}

; Work-item id intrinsic (X dimension); each kernel in this section uses its
; result as the element index for the %src1 / %src2 getelementptr.
declare i32 @llvm.amdgcn.workitem.id.x()