; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s

; Unsigned 8-way dot product of two <8 x i4> vectors with a 32-bit accumulator.
;
; The kernel loads one <8 x i4> from %src1 and one from %src2 (both indexed by
; workitem.id.x), zero-extends every lane to i32, multiplies lane-wise, and
; adds the eight products to the i32 loaded from %dst. The sum is stored back
; to %dst.
;
; Per the assertions below, the gfx906 and gfx1011/gfx1012 runs fold the whole
; chain into a single v_dot8_u32_u4, while the gfx700/gfx803/gfx900 runs expand
; it into per-nibble bit-field extracts followed by multiply-adds.
define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s11
; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_mov_b32 s14, -1
; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
; GFX8-NEXT:    s_add_u32 s12, s12, s11
; GFX8-NEXT:    s_addc_u32 s13, s13, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v9, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_acc32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s14, -1
; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
; GFX9-NEXT:    s_add_u32 s12, s12, s11
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_addc_u32 s13, s13, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT:    s_mov_b32 s14, -1
; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_acc32:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT:    s_mov_b32 s14, -1
; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
entry:
  ; Per-work-item addressing: both sources are indexed by workitem.id.x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2

  ; Lane i: zero-extend both i4 elements to i32 and multiply.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = zext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = zext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = zext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = zext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = zext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = zext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = zext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = zext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = zext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = zext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = zext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = zext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = zext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = zext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = zext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = zext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Accumulate the eight products onto the i32 currently stored at %dst.
  %acc = load i32, ptr addrspace(1) %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, ptr addrspace(1) %dst, align 4
  ret void
}

; TODO: Remove the unnecessary instruction (the one zero-extending the
; 2nd MAD) so that the pattern recognizer kicks in.
; Unsigned 8-way dot product of two <8 x i4> vectors with a 16-bit accumulator.
;
; Same shape as the 32-bit accumulator variant, but every lane is zero-extended
; to i16 and the products are added to the i16 loaded from %dst. The i16 sum
; is stored back to %dst.
;
; Per the assertions below, no run selects a dot instruction here. Every
; target expands the chain into per-nibble extracts followed by eight
; multiply-adds (v_mad_u32_u24, v_mad_u16 or v_mad_legacy_u16).
define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s11
; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
; GFX8-NEXT:    s_mov_b32 s14, -1
; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
; GFX8-NEXT:    s_add_u32 s12, s12, s11
; GFX8-NEXT:    s_addc_u32 s13, s13, 0
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_acc16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s14, -1
; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
; GFX9-NEXT:    s_add_u32 s12, s12, s11
; GFX9-NEXT:    s_addc_u32 s13, s13, 0
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
; GFX9-DL-NEXT:    s_mov_b32 s14, -1
; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT:    s_mov_b32 s14, -1
; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[6:7]
; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT:    global_store_short v1, v0, s[6:7]
; GFX10-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
entry:
  ; Per-work-item addressing: both sources are indexed by workitem.id.x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2

  ; Lane i: zero-extend both i4 elements to i16 and multiply.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = zext i4 %v1e0 to i16
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = zext i4 %v2e0 to i16
  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = zext i4 %v1e1 to i16
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = zext i4 %v2e1 to i16
  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = zext i4 %v1e2 to i16
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = zext i4 %v2e2 to i16
  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = zext i4 %v1e3 to i16
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = zext i4 %v2e3 to i16
  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = zext i4 %v1e4 to i16
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = zext i4 %v2e4 to i16
  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = zext i4 %v1e5 to i16
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = zext i4 %v2e5 to i16
  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = zext i4 %v1e6 to i16
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = zext i4 %v2e6 to i16
  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = zext i4 %v1e7 to i16
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = zext i4 %v2e7 to i16
  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7

  ; Accumulate the eight products onto the i16 currently stored at %dst.
  %acc = load i16, ptr addrspace(1) %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3
  %add5 = add i16 %add4, %mul4
  %add6 = add i16 %add5, %mul5
  %add7 = add i16 %add6, %mul6
  %add8 = add i16 %add7, %mul7

  store i16 %add8, ptr addrspace(1) %dst, align 4
  ret void
}

; TODO: Remove the unnecessary instruction (the one zero-extending the
; 2nd MAD) so that the pattern recognizer kicks in.
600define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, 601; GFX7-LABEL: udot8_acc8: 602; GFX7: ; %bb.0: ; %entry 603; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 604; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 605; GFX7-NEXT: s_mov_b32 s14, -1 606; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 607; GFX7-NEXT: s_add_u32 s12, s12, s11 608; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 609; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 610; GFX7-NEXT: s_mov_b32 s3, 0xf000 611; GFX7-NEXT: s_mov_b32 s6, 0 612; GFX7-NEXT: s_mov_b32 s7, s3 613; GFX7-NEXT: s_waitcnt lgkmcnt(0) 614; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 615; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 616; GFX7-NEXT: v_mov_b32_e32 v1, 0 617; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 618; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 619; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 620; GFX7-NEXT: s_mov_b32 s2, -1 621; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 622; GFX7-NEXT: s_addc_u32 s13, s13, 0 623; GFX7-NEXT: s_waitcnt vmcnt(2) 624; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 625; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 626; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 627; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 628; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 629; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 630; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 631; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 632; GFX7-NEXT: s_waitcnt vmcnt(1) 633; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 634; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 635; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 636; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 637; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 638; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 639; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 640; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 641; GFX7-NEXT: s_waitcnt vmcnt(0) 642; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 643; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 644; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 645; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 646; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 647; 
GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 648; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 649; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 650; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 651; GFX7-NEXT: s_endpgm 652; 653; GFX8-LABEL: udot8_acc8: 654; GFX8: ; %bb.0: ; %entry 655; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 656; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 657; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 658; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 659; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 660; GFX8-NEXT: s_waitcnt lgkmcnt(0) 661; GFX8-NEXT: v_mov_b32_e32 v1, s1 662; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 663; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 664; GFX8-NEXT: flat_load_dword v3, v[0:1] 665; GFX8-NEXT: v_mov_b32_e32 v1, s3 666; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 667; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 668; GFX8-NEXT: flat_load_dword v2, v[0:1] 669; GFX8-NEXT: v_mov_b32_e32 v0, s4 670; GFX8-NEXT: v_mov_b32_e32 v1, s5 671; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 672; GFX8-NEXT: s_mov_b32 s14, -1 673; GFX8-NEXT: s_mov_b32 s15, 0xe80000 674; GFX8-NEXT: s_add_u32 s12, s12, s11 675; GFX8-NEXT: s_addc_u32 s13, s13, 0 676; GFX8-NEXT: s_waitcnt vmcnt(2) 677; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 678; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 679; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 680; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 681; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 682; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 683; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 684; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 685; GFX8-NEXT: s_waitcnt vmcnt(1) 686; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 687; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 688; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 689; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 690; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 691; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 692; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 693; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 694; GFX8-NEXT: s_waitcnt vmcnt(0) 695; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 696; 
GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 697; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 698; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 699; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 700; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 701; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 702; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 703; GFX8-NEXT: flat_store_byte v[0:1], v2 704; GFX8-NEXT: s_endpgm 705; 706; GFX9-LABEL: udot8_acc8: 707; GFX9: ; %bb.0: ; %entry 708; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 709; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 710; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 711; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 712; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 713; GFX9-NEXT: s_waitcnt lgkmcnt(0) 714; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 715; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 716; GFX9-NEXT: v_mov_b32_e32 v0, 0 717; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] 718; GFX9-NEXT: s_mov_b32 s14, -1 719; GFX9-NEXT: s_mov_b32 s15, 0xe00000 720; GFX9-NEXT: s_add_u32 s12, s12, s11 721; GFX9-NEXT: s_addc_u32 s13, s13, 0 722; GFX9-NEXT: s_waitcnt vmcnt(2) 723; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 724; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 725; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 726; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 727; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 728; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 729; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 730; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 731; GFX9-NEXT: s_waitcnt vmcnt(1) 732; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 733; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 734; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 735; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 736; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 737; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 738; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 739; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 740; GFX9-NEXT: s_waitcnt vmcnt(0) 741; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 742; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 743; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 744; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, 
v1 745; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 746; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 747; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 748; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 749; GFX9-NEXT: global_store_byte v0, v1, s[6:7] 750; GFX9-NEXT: s_endpgm 751; 752; GFX9-DL-LABEL: udot8_acc8: 753; GFX9-DL: ; %bb.0: ; %entry 754; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 755; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 756; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 757; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 758; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 759; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 760; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 761; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 762; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 763; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 764; GFX9-DL-NEXT: s_mov_b32 s14, -1 765; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 766; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 767; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 768; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 769; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 770; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 771; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 772; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 773; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 774; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 775; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 776; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 777; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 778; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 779; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 780; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 781; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 782; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 783; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 784; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 785; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 786; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 787; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 788; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 789; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 790; GFX9-DL-NEXT: 
v_mad_legacy_u16 v1, v8, v15, v1 791; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 792; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 793; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 794; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 795; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] 796; GFX9-DL-NEXT: s_endpgm 797; 798; GFX10-DL-LABEL: udot8_acc8: 799; GFX10-DL: ; %bb.0: ; %entry 800; GFX10-DL-NEXT: s_clause 0x1 801; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 802; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 803; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 804; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 805; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 806; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 807; GFX10-DL-NEXT: s_mov_b32 s14, -1 808; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 809; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 810; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 811; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 812; GFX10-DL-NEXT: s_clause 0x1 813; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 814; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 815; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 816; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 817; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 818; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 819; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 820; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 821; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 822; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 823; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 824; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 825; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 826; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 827; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 828; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 829; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 830; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 831; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 832; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 833; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 834; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 835; GFX10-DL-NEXT: v_mad_u16 v0, v4, 
v5, v0 836; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 837; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 838; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 839; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 840; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 841; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 842; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 843; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 844; GFX10-DL-NEXT: s_endpgm 845 ptr addrspace(1) %src2, 846 ptr addrspace(1) nocapture %dst) { 847entry: 848 %idx = call i32 @llvm.amdgcn.workitem.id.x() 849 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 850 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 851 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 852 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 853 854 %v1e0 = extractelement <8 x i4> %vec1, i64 0 855 %cv1e0 = zext i4 %v1e0 to i8 856 %v2e0 = extractelement <8 x i4> %vec2, i64 0 857 %cv2e0 = zext i4 %v2e0 to i8 858 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 859 860 %v1e1 = extractelement <8 x i4> %vec1, i64 1 861 %cv1e1 = zext i4 %v1e1 to i8 862 %v2e1 = extractelement <8 x i4> %vec2, i64 1 863 %cv2e1 = zext i4 %v2e1 to i8 864 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 865 866 %v1e2 = extractelement <8 x i4> %vec1, i64 2 867 %cv1e2 = zext i4 %v1e2 to i8 868 %v2e2 = extractelement <8 x i4> %vec2, i64 2 869 %cv2e2 = zext i4 %v2e2 to i8 870 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 871 872 %v1e3 = extractelement <8 x i4> %vec1, i64 3 873 %cv1e3 = zext i4 %v1e3 to i8 874 %v2e3 = extractelement <8 x i4> %vec2, i64 3 875 %cv2e3 = zext i4 %v2e3 to i8 876 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 877 878 %v1e4 = extractelement <8 x i4> %vec1, i64 4 879 %cv1e4 = zext i4 %v1e4 to i8 880 %v2e4 = extractelement <8 x i4> %vec2, i64 4 881 %cv2e4 = zext i4 %v2e4 to i8 882 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 883 884 %v1e5 = extractelement <8 x i4> %vec1, i64 5 885 %cv1e5 = zext i4 %v1e5 to i8 886 %v2e5 = extractelement <8 x i4> %vec2, i64 5 887 %cv2e5 = zext i4 %v2e5 to i8 888 
%mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 889 890 %v1e6 = extractelement <8 x i4> %vec1, i64 6 891 %cv1e6 = zext i4 %v1e6 to i8 892 %v2e6 = extractelement <8 x i4> %vec2, i64 6 893 %cv2e6 = zext i4 %v2e6 to i8 894 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 895 896 %v1e7 = extractelement <8 x i4> %vec1, i64 7 897 %cv1e7 = zext i4 %v1e7 to i8 898 %v2e7 = extractelement <8 x i4> %vec2, i64 7 899 %cv2e7 = zext i4 %v2e7 to i8 900 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 901 902 %acc = load i8, ptr addrspace(1) %dst, align 4 903 %add1 = add i8 %mul0, %acc 904 %add2 = add i8 %add1, %mul1 905 %add3 = add i8 %add2, %mul2 906 %add4 = add i8 %add3, %mul3 907 %add5 = add i8 %add4, %mul4 908 %add6 = add i8 %add5, %mul5 909 %add7 = add i8 %add6, %mul6 910 %add8 = add i8 %add7, %mul7 911 912 store i8 %add8, ptr addrspace(1) %dst, align 4 913 ret void 914} 915 916; TODO: Remove the two unnecessary instructions (and+add after 2nd MAD) 917; so that the pattern-recognizer can kick in. ; @udot8_acc4: the eight i4 lanes of the vectors loaded from %src1 and %src2 are multiplied pairwise and the products are summed, in i4, onto the value loaded from %dst; the sum is stored back to %dst. 918define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, 919; GFX7-LABEL: udot8_acc4: 920; GFX7: ; %bb.0: ; %entry 921; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 922; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 923; GFX7-NEXT: s_mov_b32 s14, -1 924; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 925; GFX7-NEXT: s_add_u32 s12, s12, s11 926; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 927; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 928; GFX7-NEXT: s_mov_b32 s3, 0xf000 929; GFX7-NEXT: s_mov_b32 s6, 0 930; GFX7-NEXT: s_mov_b32 s7, s3 931; GFX7-NEXT: s_waitcnt lgkmcnt(0) 932; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 933; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 934; GFX7-NEXT: v_mov_b32_e32 v1, 0 935; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 936; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 937; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 938; GFX7-NEXT: s_mov_b32 s2, -1 939; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 940; GFX7-NEXT: s_addc_u32 s13, s13, 0 941; GFX7-NEXT: s_waitcnt vmcnt(2) 
942; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 943; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 944; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 945; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 946; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 947; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 948; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 949; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 950; GFX7-NEXT: s_waitcnt vmcnt(1) 951; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 952; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 953; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 954; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 955; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 956; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 957; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 958; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 959; GFX7-NEXT: s_waitcnt vmcnt(0) 960; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 961; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 962; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 963; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 964; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 965; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 966; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 967; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 968; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 969; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 970; GFX7-NEXT: s_endpgm 971; 972; GFX8-LABEL: udot8_acc4: 973; GFX8: ; %bb.0: ; %entry 974; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 975; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 976; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 977; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 978; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 979; GFX8-NEXT: s_waitcnt lgkmcnt(0) 980; GFX8-NEXT: v_mov_b32_e32 v1, s1 981; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 982; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 983; GFX8-NEXT: flat_load_dword v3, v[0:1] 984; GFX8-NEXT: v_mov_b32_e32 v1, s3 985; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 986; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 987; GFX8-NEXT: flat_load_dword v2, v[0:1] 988; GFX8-NEXT: v_mov_b32_e32 v0, s4 989; GFX8-NEXT: v_mov_b32_e32 v1, s5 990; 
GFX8-NEXT: flat_load_ubyte v4, v[0:1] 991; GFX8-NEXT: s_mov_b32 s14, -1 992; GFX8-NEXT: s_mov_b32 s15, 0xe80000 993; GFX8-NEXT: s_add_u32 s12, s12, s11 994; GFX8-NEXT: s_addc_u32 s13, s13, 0 995; GFX8-NEXT: s_waitcnt vmcnt(2) 996; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 997; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 998; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 999; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 1000; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 1001; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 1002; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 1003; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 1004; GFX8-NEXT: s_waitcnt vmcnt(1) 1005; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 1006; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 1007; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 1008; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 1009; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 1010; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 1011; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 1012; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1013; GFX8-NEXT: s_waitcnt vmcnt(0) 1014; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1015; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 1016; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 1017; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 1018; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 1019; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 1020; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 1021; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 1022; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1023; GFX8-NEXT: flat_store_byte v[0:1], v2 1024; GFX8-NEXT: s_endpgm 1025; 1026; GFX9-LABEL: udot8_acc4: 1027; GFX9: ; %bb.0: ; %entry 1028; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1029; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1030; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1031; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1032; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1033; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1034; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 1035; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1036; GFX9-NEXT: v_mov_b32_e32 v0, 0 1037; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] 1038; GFX9-NEXT: 
s_mov_b32 s14, -1 1039; GFX9-NEXT: s_mov_b32 s15, 0xe00000 1040; GFX9-NEXT: s_add_u32 s12, s12, s11 1041; GFX9-NEXT: s_addc_u32 s13, s13, 0 1042; GFX9-NEXT: s_waitcnt vmcnt(2) 1043; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1044; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1045; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 1046; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 1047; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 1048; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 1049; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 1050; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1051; GFX9-NEXT: s_waitcnt vmcnt(1) 1052; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1053; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1054; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 1055; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 1056; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 1057; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 1058; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 1059; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 1060; GFX9-NEXT: s_waitcnt vmcnt(0) 1061; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1062; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 1063; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 1064; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 1065; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 1066; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 1067; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 1068; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 1069; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1070; GFX9-NEXT: global_store_byte v0, v1, s[6:7] 1071; GFX9-NEXT: s_endpgm 1072; 1073; GFX9-DL-LABEL: udot8_acc4: 1074; GFX9-DL: ; %bb.0: ; %entry 1075; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1076; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1077; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1078; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1079; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1080; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1082; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1083; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1084; GFX9-DL-NEXT: 
global_load_ubyte v3, v0, s[6:7] 1085; GFX9-DL-NEXT: s_mov_b32 s14, -1 1086; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 1087; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 1088; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 1089; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1090; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1091; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1092; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 1093; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 1094; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 1095; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 1096; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 1097; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1098; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1099; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1100; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1101; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 1102; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 1103; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 1104; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 1105; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 1106; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 1107; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1108; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1109; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 1110; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 1111; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 1112; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 1113; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 1114; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 1115; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 1116; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1117; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] 1118; GFX9-DL-NEXT: s_endpgm 1119; 1120; GFX10-DL-LABEL: udot8_acc4: 1121; GFX10-DL: ; %bb.0: ; %entry 1122; GFX10-DL-NEXT: s_clause 0x1 1123; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1124; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1125; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1126; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1127; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1128; GFX10-DL-NEXT: s_mov_b32 
s13, SCRATCH_RSRC_DWORD1 1129; GFX10-DL-NEXT: s_mov_b32 s14, -1 1130; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 1131; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 1132; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 1133; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1134; GFX10-DL-NEXT: s_clause 0x1 1135; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 1136; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 1137; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 1138; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 1139; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 1140; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1141; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 1142; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 1143; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 1144; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1145; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 1146; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 1147; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 1148; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 1149; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 1150; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 1151; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 1152; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 1153; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 1154; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 1155; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 1156; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 1157; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 1158; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 1159; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 1160; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 1161; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 1162; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 1163; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 1164; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 1165; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 1166; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 1167; GFX10-DL-NEXT: s_endpgm 1168 ptr addrspace(1) %src2, 1169 ptr addrspace(1) nocapture %dst) { 1170entry: 1171 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1172 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 1173 
%vec1 = load <8 x i4>, ptr addrspace(1) %gep1 1174 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 1175 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 1176 1177 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1178 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1179 %mul0 = mul nuw nsw i4 %v1e0, %v2e0 1180 1181 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1182 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1183 %mul1 = mul nuw nsw i4 %v1e1, %v2e1 1184 1185 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1186 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1187 %mul2 = mul nuw nsw i4 %v1e2, %v2e2 1188 1189 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1190 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1191 %mul3 = mul nuw nsw i4 %v1e3, %v2e3 1192 1193 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1194 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1195 %mul4 = mul nuw nsw i4 %v1e4, %v2e4 1196 1197 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1198 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1199 %mul5 = mul nuw nsw i4 %v1e5, %v2e5 1200 1201 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1202 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1203 %mul6 = mul nuw nsw i4 %v1e6, %v2e6 1204 1205 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1206 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1207 %mul7 = mul nuw nsw i4 %v1e7, %v2e7 1208 1209 %acc = load i4, ptr addrspace(1) %dst, align 4 1210 %add1 = add i4 %mul0, %acc 1211 %add2 = add i4 %add1, %mul1 1212 %add3 = add i4 %add2, %mul2 1213 %add4 = add i4 %add3, %mul3 1214 %add5 = add i4 %add4, %mul4 1215 %add6 = add i4 %add5, %mul5 1216 %add7 = add i4 %add6, %mul6 1217 %add8 = add i4 %add7, %mul7 1218 1219 store i4 %add8, ptr addrspace(1) %dst, align 4 1220 ret void 1221} 1222 1223; TODO: Currently, permutation of udot8 is turned off due to a huge increase 1224; in the compile time. 
1225define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, 1226; GFX7-LABEL: udot8_CommutationInsideMAD: 1227; GFX7: ; %bb.0: ; %entry 1228; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1229; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1230; GFX7-NEXT: s_mov_b32 s14, -1 1231; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 1232; GFX7-NEXT: s_add_u32 s12, s12, s11 1233; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1234; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1235; GFX7-NEXT: s_mov_b32 s3, 0xf000 1236; GFX7-NEXT: s_mov_b32 s6, 0 1237; GFX7-NEXT: s_mov_b32 s7, s3 1238; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1239; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1240; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1241; GFX7-NEXT: v_mov_b32_e32 v1, 0 1242; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1243; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1244; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1245; GFX7-NEXT: s_mov_b32 s2, -1 1246; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 1247; GFX7-NEXT: s_addc_u32 s13, s13, 0 1248; GFX7-NEXT: s_waitcnt vmcnt(2) 1249; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 1250; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 1251; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 1252; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 1253; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 1254; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 1255; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 1256; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 1257; GFX7-NEXT: s_waitcnt vmcnt(1) 1258; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 1259; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 1260; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 1261; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 1262; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 1263; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 1264; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 1265; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 1266; GFX7-NEXT: s_waitcnt vmcnt(0) 1267; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1268; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 1269; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 1270; GFX7-NEXT: 
v_mad_u32_u24 v0, v7, v14, v0 1271; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 1272; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 1273; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 1274; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 1275; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 1276; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 1277; GFX7-NEXT: s_endpgm 1278; 1279; GFX8-LABEL: udot8_CommutationInsideMAD: 1280; GFX8: ; %bb.0: ; %entry 1281; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1282; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1283; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1284; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1285; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1286; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX8-NEXT: v_mov_b32_e32 v1, s1 1288; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1289; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1290; GFX8-NEXT: flat_load_dword v3, v[0:1] 1291; GFX8-NEXT: v_mov_b32_e32 v1, s3 1292; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1293; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1294; GFX8-NEXT: flat_load_dword v2, v[0:1] 1295; GFX8-NEXT: v_mov_b32_e32 v0, s4 1296; GFX8-NEXT: v_mov_b32_e32 v1, s5 1297; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 1298; GFX8-NEXT: s_mov_b32 s14, -1 1299; GFX8-NEXT: s_mov_b32 s15, 0xe80000 1300; GFX8-NEXT: s_add_u32 s12, s12, s11 1301; GFX8-NEXT: s_addc_u32 s13, s13, 0 1302; GFX8-NEXT: s_waitcnt vmcnt(2) 1303; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 1304; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 1305; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 1306; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 1307; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 1308; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 1309; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 1310; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 1311; GFX8-NEXT: s_waitcnt vmcnt(1) 1312; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 1313; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 1314; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 1315; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 1316; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 1317; 
GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 1318; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 1319; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1320; GFX8-NEXT: s_waitcnt vmcnt(0) 1321; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1322; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 1323; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 1324; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 1325; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 1326; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 1327; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 1328; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 1329; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1330; GFX8-NEXT: flat_store_byte v[0:1], v2 1331; GFX8-NEXT: s_endpgm 1332; 1333; GFX9-LABEL: udot8_CommutationInsideMAD: 1334; GFX9: ; %bb.0: ; %entry 1335; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1336; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1337; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1338; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1339; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1340; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1341; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 1342; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1343; GFX9-NEXT: v_mov_b32_e32 v0, 0 1344; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] 1345; GFX9-NEXT: s_mov_b32 s14, -1 1346; GFX9-NEXT: s_mov_b32 s15, 0xe00000 1347; GFX9-NEXT: s_add_u32 s12, s12, s11 1348; GFX9-NEXT: s_addc_u32 s13, s13, 0 1349; GFX9-NEXT: s_waitcnt vmcnt(2) 1350; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1351; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1352; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 1353; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 1354; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 1355; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 1356; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 1357; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1358; GFX9-NEXT: s_waitcnt vmcnt(1) 1359; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1360; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1361; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 1362; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 1363; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 1364; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 
4 1365; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 1366; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 1367; GFX9-NEXT: s_waitcnt vmcnt(0) 1368; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1369; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 1370; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 1371; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 1372; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 1373; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 1374; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 1375; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 1376; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1377; GFX9-NEXT: global_store_byte v0, v1, s[6:7] 1378; GFX9-NEXT: s_endpgm 1379; 1380; GFX9-DL-LABEL: udot8_CommutationInsideMAD: 1381; GFX9-DL: ; %bb.0: ; %entry 1382; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1383; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1384; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1385; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1386; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1387; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1388; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1389; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1390; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1391; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 1392; GFX9-DL-NEXT: s_mov_b32 s14, -1 1393; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 1394; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 1395; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 1396; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1397; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1398; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1399; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 1400; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 1401; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 1402; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 1403; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 1404; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1405; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1406; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1407; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1408; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 1409; 
GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 1410; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 1411; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 1412; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 1413; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 1414; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1415; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1416; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 1417; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 1418; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 1419; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 1420; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 1421; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 1422; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 1423; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1424; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] 1425; GFX9-DL-NEXT: s_endpgm 1426; 1427; GFX10-DL-LABEL: udot8_CommutationInsideMAD: 1428; GFX10-DL: ; %bb.0: ; %entry 1429; GFX10-DL-NEXT: s_clause 0x1 1430; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1431; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1432; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1433; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1434; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1435; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1436; GFX10-DL-NEXT: s_mov_b32 s14, -1 1437; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 1438; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 1439; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 1440; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX10-DL-NEXT: s_clause 0x1 1442; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 1443; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 1444; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 1445; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 1446; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 1447; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1448; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 1449; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 1450; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 1451; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1452; GFX10-DL-NEXT: v_mad_u16 v0, v0, 
v5, v4 1453; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 1454; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 1455; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 1456; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 1457; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 1458; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 1459; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 1460; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 1461; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 1462; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 1463; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 1464; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 1465; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 1466; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 1467; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 1468; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 1469; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 1470; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 1471; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 1472; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 1473; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 1474; GFX10-DL-NEXT: s_endpgm 1475 ptr addrspace(1) %src2, 1476 ptr addrspace(1) nocapture %dst) { 1477entry: ; The accumulation chain is commuted: from %add2 on, each add takes the new product as its first operand and the running sum as its second. 1478 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1479 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 1480 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 1481 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 1482 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 1483 1484 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1485 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1486 %mul0 = mul nuw nsw i4 %v1e0, %v2e0 1487 1488 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1489 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1490 %mul1 = mul nuw nsw i4 %v1e1, %v2e1 1491 1492 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1493 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1494 %mul2 = mul nuw nsw i4 %v1e2, %v2e2 1495 1496 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1497 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1498 %mul3 = mul nuw nsw i4 %v1e3, %v2e3 1499 1500 %v1e4 = 
extractelement <8 x i4> %vec1, i64 4 1501 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1502 %mul4 = mul nuw nsw i4 %v1e4, %v2e4 1503 1504 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1505 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1506 %mul5 = mul nuw nsw i4 %v1e5, %v2e5 1507 1508 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1509 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1510 %mul6 = mul nuw nsw i4 %v1e6, %v2e6 1511 1512 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1513 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1514 %mul7 = mul nuw nsw i4 %v1e7, %v2e7 1515 1516 %acc = load i4, ptr addrspace(1) %dst, align 4 1517 %add1 = add i4 %mul0, %acc 1518 %add2 = add i4 %mul1, %add1 1519 %add3 = add i4 %mul2, %add2 1520 %add4 = add i4 %mul3, %add3 1521 %add5 = add i4 %mul4, %add4 1522 %add6 = add i4 %mul5, %add5 1523 %add7 = add i4 %mul6, %add6 1524 %add8 = add i4 %mul7, %add7 1525 1526 store i4 %add8, ptr addrspace(1) %dst, align 4 1527 ret void 1528} 1529 1530define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, 1531; GFX7-LABEL: udot8_multiuses_mul1: 1532; GFX7: ; %bb.0: ; %entry 1533; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1534; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1535; GFX7-NEXT: s_mov_b32 s14, -1 1536; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 1537; GFX7-NEXT: s_add_u32 s12, s12, s11 1538; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1539; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1540; GFX7-NEXT: s_mov_b32 s3, 0xf000 1541; GFX7-NEXT: s_mov_b32 s6, 0 1542; GFX7-NEXT: s_mov_b32 s7, s3 1543; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1545; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1546; GFX7-NEXT: v_mov_b32_e32 v1, 0 1547; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1548; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1549; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1550; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1551; GFX7-NEXT: s_mov_b32 s2, -1 1552; GFX7-NEXT: s_addc_u32 s13, s13, 0 
1553; GFX7-NEXT: s_waitcnt vmcnt(1) 1554; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 1555; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 1556; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 1557; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 1558; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 1559; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 1560; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 1561; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 1562; GFX7-NEXT: s_waitcnt vmcnt(0) 1563; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 1564; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 1565; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 1566; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 1567; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 1568; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 1569; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 1570; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 1571; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1572; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4 1573; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16 1574; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16 1575; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2 1576; GFX7-NEXT: v_mad_u32_u24 v2, v6, v13, v2 1577; GFX7-NEXT: v_mad_u32_u24 v2, v5, v12, v2 1578; GFX7-NEXT: v_mad_u32_u24 v2, v4, v11, v2 1579; GFX7-NEXT: v_mad_u32_u24 v2, v3, v10, v2 1580; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v2 1581; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1582; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1583; GFX7-NEXT: s_endpgm 1584; 1585; GFX8-LABEL: udot8_multiuses_mul1: 1586; GFX8: ; %bb.0: ; %entry 1587; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1588; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1589; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1590; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1591; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1592; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1593; GFX8-NEXT: v_mov_b32_e32 v1, s1 1594; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1595; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1596; GFX8-NEXT: flat_load_dword v3, v[0:1] 1597; GFX8-NEXT: v_mov_b32_e32 v1, s3 1598; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1599; GFX8-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc 1600; GFX8-NEXT: flat_load_dword v0, v[0:1] 1601; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1602; GFX8-NEXT: s_mov_b32 s14, -1 1603; GFX8-NEXT: s_mov_b32 s15, 0xe80000 1604; GFX8-NEXT: s_add_u32 s12, s12, s11 1605; GFX8-NEXT: s_addc_u32 s13, s13, 0 1606; GFX8-NEXT: s_waitcnt vmcnt(1) 1607; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 1608; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 1609; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 1610; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 1611; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 1612; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 1613; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 1614; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 1615; GFX8-NEXT: s_waitcnt vmcnt(0) 1616; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 1617; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 1618; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 1619; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 1620; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 1621; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 1622; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 1623; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 1624; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1625; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0 1626; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16 1627; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16 1628; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3 1629; GFX8-NEXT: v_mad_u32_u24 v3, v6, v13, v3 1630; GFX8-NEXT: v_mad_u32_u24 v3, v5, v12, v3 1631; GFX8-NEXT: v_mad_u32_u24 v3, v4, v11, v3 1632; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3 1633; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2 1634; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 1635; GFX8-NEXT: v_mov_b32_e32 v0, s4 1636; GFX8-NEXT: v_mov_b32_e32 v1, s5 1637; GFX8-NEXT: flat_store_dword v[0:1], v2 1638; GFX8-NEXT: s_endpgm 1639; 1640; GFX9-LABEL: udot8_multiuses_mul1: 1641; GFX9: ; %bb.0: ; %entry 1642; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1643; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1644; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1645; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1646; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 
1647; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1648; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 1649; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1650; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 1651; GFX9-NEXT: s_mov_b32 s14, -1 1652; GFX9-NEXT: s_mov_b32 s15, 0xe00000 1653; GFX9-NEXT: s_add_u32 s12, s12, s11 1654; GFX9-NEXT: v_mov_b32_e32 v0, 0 1655; GFX9-NEXT: s_addc_u32 s13, s13, 0 1656; GFX9-NEXT: s_waitcnt vmcnt(1) 1657; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 1658; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1659; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 1660; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 1661; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 1662; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 1663; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 1664; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1665; GFX9-NEXT: s_waitcnt vmcnt(0) 1666; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4 1667; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1668; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 1669; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 1670; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 1671; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 1672; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 1673; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 1674; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 1675; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1676; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0 1677; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 1678; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 1679; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 1680; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 1681; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 1682; GFX9-NEXT: v_add3_u32 v2, v2, v9, v8 1683; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 1684; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 1685; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 1686; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 1687; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 1688; GFX9-NEXT: global_store_dword v0, v1, s[6:7] 1689; GFX9-NEXT: s_endpgm 1690; 1691; GFX9-DL-LABEL: udot8_multiuses_mul1: 1692; GFX9-DL: ; %bb.0: ; %entry 1693; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1694; GFX9-DL-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x34 1695; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1696; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1697; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1698; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1700; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1701; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1702; GFX9-DL-NEXT: s_mov_b32 s14, -1 1703; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 1704; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 1705; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1706; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 1707; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1708; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 1709; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1710; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 1711; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 1712; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 1713; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 1714; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 1715; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1716; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1717; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 1718; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1719; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 1720; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 1721; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 1722; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 1723; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 1724; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 1725; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 1726; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0 1728; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 1729; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 1730; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1 1731; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14 1732; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13 1733; GFX9-DL-NEXT: v_add3_u32 v2, v2, v9, v8 1734; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12 1735; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 1736; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 1737; GFX9-DL-NEXT: 
v_add3_u32 v2, v2, v5, v4 1738; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 1739; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1740; GFX9-DL-NEXT: s_endpgm 1741; 1742; GFX10-DL-LABEL: udot8_multiuses_mul1: 1743; GFX10-DL: ; %bb.0: ; %entry 1744; GFX10-DL-NEXT: s_clause 0x1 1745; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1746; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1747; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1748; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1749; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1750; GFX10-DL-NEXT: s_mov_b32 s14, -1 1751; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 1752; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 1753; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 1754; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX10-DL-NEXT: s_clause 0x1 1756; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1757; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1758; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1759; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1760; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1761; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1 1762; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1763; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2 1764; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 1765; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 1766; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 1767; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 1768; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 1769; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 1770; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4 1771; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 1772; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 1773; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 1774; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1775; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0 1776; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 1777; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 1778; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11 1779; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12 1780; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13 1781; GFX10-DL-NEXT: 
v_lshrrev_b32_e32 v10, 28, v2 1782; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 1783; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15 1784; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14 1785; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7 1786; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2 1787; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10 1788; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9 1789; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5 1790; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 1791; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1792; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 1793; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] 1794; GFX10-DL-NEXT: s_endpgm 1795 ptr addrspace(1) %src2, 1796 ptr addrspace(1) nocapture %dst) { 1797entry: 1798 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1799 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 1800 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 1801 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 1802 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 1803 1804 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1805 %cv1e0 = zext i4 %v1e0 to i32 1806 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1807 %cv2e0 = zext i4 %v2e0 to i32 1808 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 1809 1810 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1811 %cv1e1 = zext i4 %v1e1 to i32 1812 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1813 %cv2e1 = zext i4 %v2e1 to i32 1814 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 1815 1816 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1817 %cv1e2 = zext i4 %v1e2 to i32 1818 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1819 %cv2e2 = zext i4 %v2e2 to i32 1820 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 1821 1822 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1823 %cv1e3 = zext i4 %v1e3 to i32 1824 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1825 %cv2e3 = zext i4 %v2e3 to i32 1826 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 1827 1828 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1829 %cv1e4 = zext i4 %v1e4 to i32 1830 %v2e4 = 
extractelement <8 x i4> %vec2, i64 4 1831 %cv2e4 = zext i4 %v2e4 to i32 1832 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 1833 1834 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1835 %cv1e5 = zext i4 %v1e5 to i32 1836 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1837 %cv2e5 = zext i4 %v2e5 to i32 1838 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 1839 1840 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1841 %cv1e6 = zext i4 %v1e6 to i32 1842 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1843 %cv2e6 = zext i4 %v2e6 to i32 1844 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 1845 1846 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1847 %cv1e7 = zext i4 %v1e7 to i32 1848 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1849 %cv2e7 = zext i4 %v2e7 to i32 1850 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 1851 1852 %acc = load i32, ptr addrspace(1) %dst, align 4 1853 %add1 = add i32 %mul0, %acc 1854 %add = add i32 %mul0, %add1 1855 %add2 = add i32 %add1, %mul1 1856 %add3 = add i32 %add2, %mul2 1857 %add4 = add i32 %add3, %mul3 1858 %add5 = add i32 %add4, %mul4 1859 %add6 = add i32 %add5, %mul5 1860 %add7 = add i32 %add6, %mul6 1861 %add8 = add i32 %add7, %mul7 1862 1863 %res = add i32 %add, %add8 1864 store i32 %res, ptr addrspace(1) %dst, align 4 1865 ret void 1866} 1867 1868define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, 1869; GFX7-LABEL: udot8_acc32_vecMul: 1870; GFX7: ; %bb.0: ; %entry 1871; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1872; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1873; GFX7-NEXT: s_mov_b32 s14, -1 1874; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 1875; GFX7-NEXT: s_add_u32 s12, s12, s11 1876; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1877; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1878; GFX7-NEXT: s_mov_b32 s3, 0xf000 1879; GFX7-NEXT: s_mov_b32 s6, 0 1880; GFX7-NEXT: s_mov_b32 s7, s3 1881; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1882; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1883; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1884; GFX7-NEXT: v_mov_b32_e32 v1, 0 1885; 
GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1886; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1887; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1888; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1889; GFX7-NEXT: s_mov_b32 s2, -1 1890; GFX7-NEXT: s_addc_u32 s13, s13, 0 1891; GFX7-NEXT: s_waitcnt vmcnt(1) 1892; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 1893; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 1894; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 1895; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 1896; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4 1897; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 1898; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 1899; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 1900; GFX7-NEXT: s_waitcnt vmcnt(0) 1901; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 1902; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 1903; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 1904; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 1905; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4 1906; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 1907; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 1908; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 1909; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1910; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4 1911; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 1912; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 1913; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 1914; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 1915; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 1916; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 1917; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0 1918; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1919; GFX7-NEXT: s_endpgm 1920; 1921; GFX8-LABEL: udot8_acc32_vecMul: 1922; GFX8: ; %bb.0: ; %entry 1923; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1924; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1925; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1926; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1927; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1928; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1929; GFX8-NEXT: v_mov_b32_e32 v1, s1 1930; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1931; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1932; GFX8-NEXT: flat_load_dword v3, v[0:1] 1933; GFX8-NEXT: v_mov_b32_e32 v1, s3 1934; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1935; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1936; GFX8-NEXT: flat_load_dword v0, v[0:1] 1937; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1938; GFX8-NEXT: s_mov_b32 s14, -1 1939; GFX8-NEXT: s_mov_b32 s15, 0xe80000 1940; GFX8-NEXT: s_add_u32 s12, s12, s11 1941; GFX8-NEXT: s_addc_u32 s13, s13, 0 1942; GFX8-NEXT: s_waitcnt vmcnt(1) 1943; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 1944; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 1945; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 1946; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 1947; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 1948; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 1949; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 1950; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 1951; GFX8-NEXT: s_waitcnt vmcnt(0) 1952; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 1953; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 1954; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 1955; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 1956; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 1957; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 1958; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 1959; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 1960; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1961; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 1962; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 1963; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 1964; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 1965; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0 1966; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 1967; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 1968; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 1969; GFX8-NEXT: v_mov_b32_e32 v0, s4 1970; GFX8-NEXT: v_mov_b32_e32 v1, s5 1971; GFX8-NEXT: flat_store_dword v[0:1], v2 1972; GFX8-NEXT: s_endpgm 1973; 1974; GFX9-LABEL: udot8_acc32_vecMul: 1975; GFX9: ; %bb.0: ; %entry 1976; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1977; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1978; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1979; 
GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1980; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1981; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1982; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 1983; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1984; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 1985; GFX9-NEXT: s_mov_b32 s14, -1 1986; GFX9-NEXT: s_mov_b32 s15, 0xe00000 1987; GFX9-NEXT: s_add_u32 s12, s12, s11 1988; GFX9-NEXT: v_mov_b32_e32 v0, 0 1989; GFX9-NEXT: s_addc_u32 s13, s13, 0 1990; GFX9-NEXT: s_waitcnt vmcnt(1) 1991; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 1992; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 1993; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4 1994; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4 1995; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 1996; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 1997; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4 1998; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1999; GFX9-NEXT: s_waitcnt vmcnt(0) 2000; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 2001; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 2002; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 2003; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 2004; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 2005; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 2006; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4 2007; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 2008; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 2009; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16 2010; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 2011; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 2012; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2013; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 2014; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 2015; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 2016; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 2017; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 2018; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 2019; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 2020; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 2021; GFX9-NEXT: global_store_dword v0, v1, s[6:7] 2022; GFX9-NEXT: s_endpgm 2023; 2024; GFX9-DL-LABEL: udot8_acc32_vecMul: 2025; GFX9-DL: ; %bb.0: ; %entry 2026; GFX9-DL-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 2027; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2028; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2029; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2030; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2031; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2032; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2033; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2034; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2035; GFX9-DL-NEXT: s_mov_b32 s14, -1 2036; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 2037; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 2038; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2039; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 2040; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2041; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 2042; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 2043; GFX9-DL-NEXT: s_endpgm 2044; 2045; GFX10-DL-LABEL: udot8_acc32_vecMul: 2046; GFX10-DL: ; %bb.0: ; %entry 2047; GFX10-DL-NEXT: s_clause 0x1 2048; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2049; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2050; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2051; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2052; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2053; GFX10-DL-NEXT: s_mov_b32 s14, -1 2054; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 2055; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 2056; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 2057; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2058; GFX10-DL-NEXT: s_clause 0x1 2059; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2060; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2061; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 2062; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2063; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2064; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2065; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 2066; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 2067; GFX10-DL-NEXT: s_endpgm 2068 ptr addrspace(1) %src2, 2069 ptr addrspace(1) nocapture %dst) { 2070entry: 2071 %idx = call i32 
@llvm.amdgcn.workitem.id.x() 2072 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 2073 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 2074 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 2075 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 2076 2077 %cvec1 = zext <8 x i4> %vec1 to <8 x i32> 2078 %cvec2 = zext <8 x i4> %vec2 to <8 x i32> 2079 2080 %mul = mul <8 x i32> %cvec1, %cvec2 2081 %mul0 = extractelement <8 x i32> %mul, i64 0 2082 %mul1 = extractelement <8 x i32> %mul, i64 1 2083 %mul2 = extractelement <8 x i32> %mul, i64 2 2084 %mul3 = extractelement <8 x i32> %mul, i64 3 2085 %mul4 = extractelement <8 x i32> %mul, i64 4 2086 %mul5 = extractelement <8 x i32> %mul, i64 5 2087 %mul6 = extractelement <8 x i32> %mul, i64 6 2088 %mul7 = extractelement <8 x i32> %mul, i64 7 2089 2090 %acc = load i32, ptr addrspace(1) %dst, align 4 2091 %add1 = add i32 %mul0, %acc 2092 %add2 = add i32 %add1, %mul1 2093 %add3 = add i32 %add2, %mul2 2094 %add4 = add i32 %add3, %mul3 2095 %add5 = add i32 %add4, %mul4 2096 %add6 = add i32 %add5, %mul5 2097 %add7 = add i32 %add6, %mul6 2098 %add8 = add i32 %add7, %mul7 2099 2100 store i32 %add8, ptr addrspace(1) %dst, align 4 2101 ret void 2102} 2103 2104; TODO: Clean up the code (by default pk_mad_I16 should be generated), then 2105; support the pattern. 
2106define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, 2107; GFX7-LABEL: udot8_acc16_vecMul: 2108; GFX7: ; %bb.0: ; %entry 2109; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2110; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2111; GFX7-NEXT: s_mov_b32 s14, -1 2112; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 2113; GFX7-NEXT: s_add_u32 s12, s12, s11 2114; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2115; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2116; GFX7-NEXT: s_mov_b32 s3, 0xf000 2117; GFX7-NEXT: s_mov_b32 s6, 0 2118; GFX7-NEXT: s_mov_b32 s7, s3 2119; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2120; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2121; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2122; GFX7-NEXT: v_mov_b32_e32 v1, 0 2123; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2124; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2125; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2126; GFX7-NEXT: s_mov_b32 s2, -1 2127; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 2128; GFX7-NEXT: s_addc_u32 s13, s13, 0 2129; GFX7-NEXT: s_waitcnt vmcnt(2) 2130; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 2131; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 2132; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 2133; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 2134; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 2135; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 2136; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 2137; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 2138; GFX7-NEXT: s_waitcnt vmcnt(1) 2139; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 2140; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 2141; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 2142; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 2143; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 2144; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 2145; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 2146; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 2147; GFX7-NEXT: s_waitcnt vmcnt(0) 2148; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2149; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 2150; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 2151; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, 
v0 2152; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 2153; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 2154; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 2155; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 2156; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 2157; GFX7-NEXT: s_endpgm 2158; 2159; GFX8-LABEL: udot8_acc16_vecMul: 2160; GFX8: ; %bb.0: ; %entry 2161; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2162; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2163; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2164; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2165; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2166; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2167; GFX8-NEXT: v_mov_b32_e32 v1, s1 2168; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2169; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2170; GFX8-NEXT: flat_load_dword v3, v[0:1] 2171; GFX8-NEXT: v_mov_b32_e32 v1, s3 2172; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2173; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2174; GFX8-NEXT: flat_load_dword v2, v[0:1] 2175; GFX8-NEXT: v_mov_b32_e32 v0, s4 2176; GFX8-NEXT: v_mov_b32_e32 v1, s5 2177; GFX8-NEXT: flat_load_ushort v4, v[0:1] 2178; GFX8-NEXT: s_mov_b32 s14, -1 2179; GFX8-NEXT: s_mov_b32 s15, 0xe80000 2180; GFX8-NEXT: s_add_u32 s12, s12, s11 2181; GFX8-NEXT: s_addc_u32 s13, s13, 0 2182; GFX8-NEXT: s_waitcnt vmcnt(2) 2183; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 2184; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 2185; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 2186; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 2187; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 2188; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 2189; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 2190; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 2191; GFX8-NEXT: s_waitcnt vmcnt(1) 2192; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 2193; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 2194; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 2195; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 2196; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 2197; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 2198; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 2199; GFX8-NEXT: 
v_and_b32_e32 v2, 15, v2 2200; GFX8-NEXT: s_waitcnt vmcnt(0) 2201; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2202; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 2203; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 2204; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 2205; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 2206; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 2207; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 2208; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 2209; GFX8-NEXT: flat_store_short v[0:1], v2 2210; GFX8-NEXT: s_endpgm 2211; 2212; GFX9-LABEL: udot8_acc16_vecMul: 2213; GFX9: ; %bb.0: ; %entry 2214; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2215; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2216; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2217; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2218; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2219; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2220; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 2221; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 2222; GFX9-NEXT: v_mov_b32_e32 v0, 0 2223; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] 2224; GFX9-NEXT: s_mov_b32 s0, 0x5040100 2225; GFX9-NEXT: s_mov_b32 s14, -1 2226; GFX9-NEXT: s_mov_b32 s15, 0xe00000 2227; GFX9-NEXT: s_add_u32 s12, s12, s11 2228; GFX9-NEXT: s_addc_u32 s13, s13, 0 2229; GFX9-NEXT: s_waitcnt vmcnt(2) 2230; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 2231; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 2232; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 2233; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 2234; GFX9-NEXT: s_waitcnt vmcnt(1) 2235; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 2236; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 2237; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 2238; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 2239; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 2240; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 2241; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 2242; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 2243; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 2244; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 2245; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 2246; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 2247; GFX9-NEXT: 
s_waitcnt vmcnt(0) 2248; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 2249; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 2250; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2251; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 2252; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 2253; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 2254; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2255; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 2256; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2257; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 2258; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 2259; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 2260; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 2261; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2262; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 2263; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2264; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 2265; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2266; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 2267; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2268; GFX9-NEXT: global_store_short v0, v1, s[6:7] 2269; GFX9-NEXT: s_endpgm 2270; 2271; GFX9-DL-LABEL: udot8_acc16_vecMul: 2272; GFX9-DL: ; %bb.0: ; %entry 2273; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2274; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2275; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2276; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2277; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2278; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2279; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2280; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2281; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2282; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] 2283; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 2284; GFX9-DL-NEXT: s_mov_b32 s14, -1 2285; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 2286; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 2287; 
GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 2288; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2289; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 2290; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 2291; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 2292; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 2293; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2294; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 2295; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 2296; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 2297; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 2298; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 2299; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 2300; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 2301; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 2302; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 2303; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 2304; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 2305; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 2306; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2307; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 2308; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 2309; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2310; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 2311; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 2312; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 2313; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2314; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 2315; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2316; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 2317; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 2318; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 2319; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 2320; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2321; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 2322; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2323; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 2324; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2325; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 2326; GFX9-DL-NEXT: v_add_u16_sdwa v1, 
v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2327; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] 2328; GFX9-DL-NEXT: s_endpgm 2329; 2330; GFX10-DL-LABEL: udot8_acc16_vecMul: 2331; GFX10-DL: ; %bb.0: ; %entry 2332; GFX10-DL-NEXT: s_clause 0x1 2333; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2334; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2335; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2336; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2337; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2338; GFX10-DL-NEXT: s_mov_b32 s14, -1 2339; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 2340; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 2341; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 2342; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2343; GFX10-DL-NEXT: s_clause 0x1 2344; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2345; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2346; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2347; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] 2348; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 2349; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 2350; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2351; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 2352; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 2353; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 2354; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 2355; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 2356; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 2357; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 2358; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 2359; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 2360; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 2361; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 2362; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 2363; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 2364; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 2365; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 2366; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 2367; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2368; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 2369; GFX10-DL-NEXT: 
v_bfe_u32 v4, v2, 16, 4 2370; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 2371; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 2372; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 2373; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 2374; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 2375; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 2376; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2377; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 2378; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 2379; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2380; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 2381; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 2382; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 2383; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 2384; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2385; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 2386; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2387; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 2388; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2389; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 2390; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 2391; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] 2392; GFX10-DL-NEXT: s_endpgm 2393 ptr addrspace(1) %src2, 2394 ptr addrspace(1) nocapture %dst) { 2395entry: 2396 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2397 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 2398 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 2399 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 2400 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 2401 2402 %cvec1 = zext <8 x i4> %vec1 to <8 x i16> 2403 %cvec2 = zext <8 x i4> %vec2 to <8 x i16> 2404 2405 %mul = mul <8 x i16> %cvec1, %cvec2 2406 %mul0 = extractelement <8 x i16> %mul, i64 0 2407 %mul1 = extractelement <8 x i16> %mul, i64 1 2408 %mul2 = extractelement <8 x i16> %mul, i64 2 2409 %mul3 = extractelement <8 x i16> %mul, i64 3 2410 %mul4 = extractelement <8 x i16> %mul, i64 4 2411 %mul5 = extractelement <8 x i16> %mul, i64 5 2412 %mul6 = extractelement <8 x i16> %mul, i64 6 2413 
; NOTE(review): udot8_acc8_vecMul (its define follows the first TODO further along in this chunk) zero-extends two <8 x i4> loads to <8 x i8>, multiplies them lane-wise, and adds the eight extracted lanes one by one onto an i8 accumulator that is loaded from and stored back to %dst. Its expected output uses mul/add/mad sequences on every target listed; no v_dot8 instruction appears.
%mul7 = extractelement <8 x i16> %mul, i64 7 2414 2415 %acc = load i16, ptr addrspace(1) %dst, align 4 2416 %add1 = add i16 %mul0, %acc 2417 %add2 = add i16 %add1, %mul1 2418 %add3 = add i16 %add2, %mul2 2419 %add4 = add i16 %add3, %mul3 2420 %add5 = add i16 %add4, %mul4 2421 %add6 = add i16 %add5, %mul5 2422 %add7 = add i16 %add6, %mul6 2423 %add8 = add i16 %add7, %mul7 2424 2425 store i16 %add8, ptr addrspace(1) %dst, align 4 2426 ret void 2427} 2428 2429; TODO: Clean up the code to generate MAD; pattern should be recognized then. 2430define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, 2431; GFX7-LABEL: udot8_acc8_vecMul: 2432; GFX7: ; %bb.0: ; %entry 2433; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2434; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2435; GFX7-NEXT: s_mov_b32 s14, -1 2436; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 2437; GFX7-NEXT: s_add_u32 s12, s12, s11 2438; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2439; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2440; GFX7-NEXT: s_mov_b32 s3, 0xf000 2441; GFX7-NEXT: s_mov_b32 s6, 0 2442; GFX7-NEXT: s_mov_b32 s7, s3 2443; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2444; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2445; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2446; GFX7-NEXT: v_mov_b32_e32 v1, 0 2447; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2448; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2449; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2450; GFX7-NEXT: s_mov_b32 s2, -1 2451; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 2452; GFX7-NEXT: s_addc_u32 s13, s13, 0 2453; GFX7-NEXT: s_waitcnt vmcnt(2) 2454; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 2455; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 2456; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 2457; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 2458; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 2459; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 2460; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 2461; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 2462; GFX7-NEXT: s_waitcnt vmcnt(1) 2463; GFX7-NEXT: 
v_lshrrev_b32_e32 v10, 28, v0 2464; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 2465; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 2466; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 2467; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 2468; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 2469; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 2470; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 2471; GFX7-NEXT: s_waitcnt vmcnt(0) 2472; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2473; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 2474; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 2475; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 2476; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 2477; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 2478; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 2479; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 2480; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2481; GFX7-NEXT: s_endpgm 2482; 2483; GFX8-LABEL: udot8_acc8_vecMul: 2484; GFX8: ; %bb.0: ; %entry 2485; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2486; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2487; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2488; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2489; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2490; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2491; GFX8-NEXT: v_mov_b32_e32 v1, s1 2492; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2493; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2494; GFX8-NEXT: flat_load_dword v3, v[0:1] 2495; GFX8-NEXT: v_mov_b32_e32 v1, s3 2496; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2497; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2498; GFX8-NEXT: flat_load_dword v2, v[0:1] 2499; GFX8-NEXT: v_mov_b32_e32 v0, s4 2500; GFX8-NEXT: v_mov_b32_e32 v1, s5 2501; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 2502; GFX8-NEXT: s_mov_b32 s14, -1 2503; GFX8-NEXT: s_mov_b32 s15, 0xe80000 2504; GFX8-NEXT: s_add_u32 s12, s12, s11 2505; GFX8-NEXT: s_addc_u32 s13, s13, 0 2506; GFX8-NEXT: s_waitcnt vmcnt(2) 2507; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 2508; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 2509; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4 2510; 
GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4 2511; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 2512; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 2513; GFX8-NEXT: s_waitcnt vmcnt(1) 2514; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2 2515; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4 2516; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4 2517; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4 2518; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4 2519; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4 2520; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2521; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17 2522; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2523; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4 2524; GFX8-NEXT: v_and_b32_e32 v6, 15, v3 2525; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4 2526; GFX8-NEXT: v_and_b32_e32 v13, 15, v2 2527; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19 2528; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15 2529; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2530; GFX8-NEXT: v_or_b32_e32 v9, v18, v9 2531; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2532; GFX8-NEXT: v_or_b32_e32 v3, v2, v11 2533; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 2534; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 2535; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13 2536; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 2537; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2538; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 2539; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 2540; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 2541; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] 2542; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 2543; GFX8-NEXT: s_waitcnt vmcnt(0) 2544; GFX8-NEXT: v_add_u16_e32 v3, v6, v4 2545; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 2546; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 2547; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 2548; GFX8-NEXT: v_mad_u16 
v2, v12, v19, v2 2549; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 2550; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9 2551; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 2552; GFX8-NEXT: v_add_u16_e32 v2, v2, v9 2553; GFX8-NEXT: flat_store_byte v[0:1], v2 2554; GFX8-NEXT: s_endpgm 2555; 2556; GFX9-LABEL: udot8_acc8_vecMul: 2557; GFX9: ; %bb.0: ; %entry 2558; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2559; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2560; GFX9-NEXT: s_mov_b32 s14, -1 2561; GFX9-NEXT: s_mov_b32 s15, 0xe00000 2562; GFX9-NEXT: s_add_u32 s12, s12, s11 2563; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2564; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2565; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2566; GFX9-NEXT: v_mov_b32_e32 v3, 0 2567; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2568; GFX9-NEXT: global_load_dword v1, v0, s[8:9] 2569; GFX9-NEXT: global_load_dword v2, v0, s[10:11] 2570; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] 2571; GFX9-NEXT: s_addc_u32 s13, s13, 0 2572; GFX9-NEXT: s_waitcnt vmcnt(2) 2573; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 2574; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 2575; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 2576; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 2577; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1 2578; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4 2579; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4 2580; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4 2581; GFX9-NEXT: s_waitcnt vmcnt(1) 2582; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4 2583; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 2584; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 2585; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 2586; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 2587; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 2588; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4 2589; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4 2590; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2 2591; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2592; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16 2593; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2594; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 2595; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2596; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 2597; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2598; GFX9-NEXT: v_or_b32_e32 v0, v18, v10 2599; GFX9-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2600; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 2601; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2602; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 2603; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 2604; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v8 2605; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 2606; GFX9-NEXT: v_or_b32_e32 v10, v12, v0 2607; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] 2608; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 2609; GFX9-NEXT: s_waitcnt vmcnt(0) 2610; GFX9-NEXT: v_add_u16_e32 v4, v5, v4 2611; GFX9-NEXT: v_add_u16_e32 v1, v4, v1 2612; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 2613; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 2614; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 2615; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 2616; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 2617; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 2618; GFX9-NEXT: global_store_byte v3, v0, s[0:1] 2619; GFX9-NEXT: s_endpgm 2620; 2621; GFX9-DL-LABEL: udot8_acc8_vecMul: 2622; GFX9-DL: ; %bb.0: ; %entry 2623; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2624; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2625; GFX9-DL-NEXT: s_mov_b32 s14, -1 2626; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 2627; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 2628; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2629; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2630; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2631; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 2632; GFX9-DL-NEXT: 
s_waitcnt lgkmcnt(0) 2633; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 2634; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 2635; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] 2636; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 2637; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2638; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 2639; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 2640; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 2641; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 2642; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 2643; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 2644; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 2645; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 2646; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2647; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 2648; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 2649; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 2650; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 2651; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 2652; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 2653; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4 2654; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4 2655; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2 2656; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2657; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16 2658; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2659; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 2660; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2661; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 2662; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2663; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v10 2664; GFX9-DL-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2665; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 2666; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD 2667; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 2668; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12 2669; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v8 2670; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 2671; GFX9-DL-NEXT: v_or_b32_e32 v10, v12, v0 2672; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] 2673; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v10 2674; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2675; GFX9-DL-NEXT: v_add_u16_e32 v4, v5, v4 2676; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1 2677; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 2678; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 2679; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 2680; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 2681; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 2682; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 2683; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1] 2684; GFX9-DL-NEXT: s_endpgm 2685; 2686; GFX10-DL-LABEL: udot8_acc8_vecMul: 2687; GFX10-DL: ; %bb.0: ; %entry 2688; GFX10-DL-NEXT: s_clause 0x1 2689; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2690; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2691; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2692; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 2693; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2694; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2695; GFX10-DL-NEXT: s_mov_b32 s14, -1 2696; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 2697; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 2698; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 2699; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2700; GFX10-DL-NEXT: s_clause 0x1 2701; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2702; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2703; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7] 2704; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 2705; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 2706; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2707; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 2708; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 2709; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 2710; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 
2711; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 2712; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 2713; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 2714; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 2715; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4 2716; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 2717; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 2718; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 2719; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 2720; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1 2721; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 2722; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 2723; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9 2724; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 2725; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 2726; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6 2727; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15 2728; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14 2729; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0 2730; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13 2731; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 2732; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7 2733; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16 2734; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 2735; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 2736; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2737; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9 2738; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2 2739; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2740; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13 2741; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2742; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 2743; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2744; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9 2745; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] 2746; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 2747; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6 2748; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 2749; GFX10-DL-NEXT: v_mad_u16 v0, v12, 
v7, v0 2750; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 2751; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 2752; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0 2753; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 2754; GFX10-DL-NEXT: global_store_byte v4, v0, s[6:7] 2755; GFX10-DL-NEXT: s_endpgm 2756 ptr addrspace(1) %src2, 2757 ptr addrspace(1) nocapture %dst) { 2758entry: 2759 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2760 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 2761 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 2762 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 2763 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 2764 2765 %cvec1 = zext <8 x i4> %vec1 to <8 x i8> 2766 %cvec2 = zext <8 x i4> %vec2 to <8 x i8> 2767 2768 %mul = mul <8 x i8> %cvec1, %cvec2 2769 %mul0 = extractelement <8 x i8> %mul, i64 0 2770 %mul1 = extractelement <8 x i8> %mul, i64 1 2771 %mul2 = extractelement <8 x i8> %mul, i64 2 2772 %mul3 = extractelement <8 x i8> %mul, i64 3 2773 %mul4 = extractelement <8 x i8> %mul, i64 4 2774 %mul5 = extractelement <8 x i8> %mul, i64 5 2775 %mul6 = extractelement <8 x i8> %mul, i64 6 2776 %mul7 = extractelement <8 x i8> %mul, i64 7 2777 2778 %acc = load i8, ptr addrspace(1) %dst, align 4 2779 %add1 = add i8 %mul0, %acc 2780 %add2 = add i8 %add1, %mul1 2781 %add3 = add i8 %add2, %mul2 2782 %add4 = add i8 %add3, %mul3 2783 %add5 = add i8 %add4, %mul4 2784 %add6 = add i8 %add5, %mul5 2785 %add7 = add i8 %add6, %mul6 2786 %add8 = add i8 %add7, %mul7 2787 2788 store i8 %add8, ptr addrspace(1) %dst, align 4 2789 ret void 2790} 2791 2792; TODO: Once the additional "and+add" are removed, the pattern will be recognized. 
2793define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, 2794; GFX7-LABEL: udot8_acc4_vecMul: 2795; GFX7: ; %bb.0: ; %entry 2796; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2797; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2798; GFX7-NEXT: s_mov_b32 s14, -1 2799; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 2800; GFX7-NEXT: s_add_u32 s12, s12, s11 2801; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2802; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2803; GFX7-NEXT: s_mov_b32 s3, 0xf000 2804; GFX7-NEXT: s_mov_b32 s6, 0 2805; GFX7-NEXT: s_mov_b32 s7, s3 2806; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2807; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2808; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2809; GFX7-NEXT: v_mov_b32_e32 v1, 0 2810; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2811; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2812; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2813; GFX7-NEXT: s_mov_b32 s2, -1 2814; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 2815; GFX7-NEXT: s_addc_u32 s13, s13, 0 2816; GFX7-NEXT: s_waitcnt vmcnt(2) 2817; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 2818; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 2819; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 2820; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 2821; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 2822; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 2823; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 2824; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 2825; GFX7-NEXT: s_waitcnt vmcnt(1) 2826; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 2827; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 2828; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 2829; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 2830; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 2831; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 2832; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 2833; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 2834; GFX7-NEXT: s_waitcnt vmcnt(0) 2835; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2836; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 2837; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 2838; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 
2839; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 2840; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 2841; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 2842; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 2843; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 2844; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2845; GFX7-NEXT: s_endpgm 2846; 2847; GFX8-LABEL: udot8_acc4_vecMul: 2848; GFX8: ; %bb.0: ; %entry 2849; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2850; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2851; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2852; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2853; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2854; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2855; GFX8-NEXT: v_mov_b32_e32 v1, s1 2856; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2857; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2858; GFX8-NEXT: flat_load_dword v3, v[0:1] 2859; GFX8-NEXT: v_mov_b32_e32 v1, s3 2860; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2861; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2862; GFX8-NEXT: flat_load_dword v2, v[0:1] 2863; GFX8-NEXT: v_mov_b32_e32 v0, s4 2864; GFX8-NEXT: v_mov_b32_e32 v1, s5 2865; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 2866; GFX8-NEXT: s_mov_b32 s14, -1 2867; GFX8-NEXT: s_mov_b32 s15, 0xe80000 2868; GFX8-NEXT: s_add_u32 s12, s12, s11 2869; GFX8-NEXT: s_addc_u32 s13, s13, 0 2870; GFX8-NEXT: s_waitcnt vmcnt(2) 2871; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 2872; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 2873; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 2874; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 2875; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 2876; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 2877; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 2878; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 2879; GFX8-NEXT: s_waitcnt vmcnt(1) 2880; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 2881; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 2882; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 2883; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 2884; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 2885; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 2886; 
GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 2887; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 2888; GFX8-NEXT: s_waitcnt vmcnt(0) 2889; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2890; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 2891; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 2892; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 2893; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 2894; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 2895; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 2896; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 2897; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 2898; GFX8-NEXT: flat_store_byte v[0:1], v2 2899; GFX8-NEXT: s_endpgm 2900; 2901; GFX9-LABEL: udot8_acc4_vecMul: 2902; GFX9: ; %bb.0: ; %entry 2903; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2904; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2905; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2906; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2907; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2908; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2909; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 2910; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 2911; GFX9-NEXT: v_mov_b32_e32 v0, 0 2912; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] 2913; GFX9-NEXT: s_mov_b32 s0, 0x5040100 2914; GFX9-NEXT: s_mov_b32 s14, -1 2915; GFX9-NEXT: s_mov_b32 s15, 0xe00000 2916; GFX9-NEXT: s_add_u32 s12, s12, s11 2917; GFX9-NEXT: s_addc_u32 s13, s13, 0 2918; GFX9-NEXT: s_waitcnt vmcnt(2) 2919; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 2920; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 2921; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 2922; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 2923; GFX9-NEXT: s_waitcnt vmcnt(1) 2924; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 2925; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 2926; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 2927; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 2928; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 2929; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 2930; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 2931; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 2932; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 2933; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 2934; GFX9-NEXT: 
v_perm_b32 v8, v9, v8, s0 2935; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 2936; GFX9-NEXT: s_waitcnt vmcnt(0) 2937; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 2938; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 2939; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2940; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 2941; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 2942; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 2943; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2944; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 2945; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2946; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 2947; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 2948; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 2949; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 2950; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2951; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 2952; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2953; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 2954; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2955; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 2956; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2957; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 2958; GFX9-NEXT: global_store_byte v0, v1, s[6:7] 2959; GFX9-NEXT: s_endpgm 2960; 2961; GFX9-DL-LABEL: udot8_acc4_vecMul: 2962; GFX9-DL: ; %bb.0: ; %entry 2963; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2964; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2965; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2966; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2967; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2968; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2969; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2970; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2971; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2972; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 2973; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 2974; 
GFX9-DL-NEXT: s_mov_b32 s14, -1 2975; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 2976; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 2977; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 2978; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2979; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 2980; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 2981; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 2982; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 2983; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2984; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 2985; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 2986; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 2987; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 2988; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 2989; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 2990; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 2991; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 2992; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 2993; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 2994; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 2995; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 2996; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2997; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 2998; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 2999; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 3000; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 3001; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 3002; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 3003; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 3004; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 3005; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3006; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 3007; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 3008; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 3009; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 3010; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 3011; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 3012; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3013; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 3014; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3015; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 3016; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3017; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 3018; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] 3019; GFX9-DL-NEXT: s_endpgm 3020; 3021; GFX10-DL-LABEL: udot8_acc4_vecMul: 3022; GFX10-DL: ; %bb.0: ; %entry 3023; GFX10-DL-NEXT: s_clause 0x1 3024; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3025; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3026; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3027; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 3028; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 3029; GFX10-DL-NEXT: s_mov_b32 s14, -1 3030; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 3031; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 3032; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 3033; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3034; GFX10-DL-NEXT: s_clause 0x1 3035; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 3036; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 3037; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 3038; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] 3039; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 3040; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 3041; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 3042; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 3043; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 3044; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 3045; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 3046; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 3047; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 3048; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 3049; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 3050; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 3051; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 3052; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 3053; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 3054; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 3055; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 3056; GFX10-DL-NEXT: 
v_bfe_u32 v9, v2, 20, 4 3057; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 3058; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3059; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 3060; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 3061; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 3062; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 3063; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 3064; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 3065; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 3066; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 3067; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 3068; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 3069; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 3070; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 3071; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 3072; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 3073; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 3074; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 3075; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 3076; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 3077; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 3078; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 3079; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 3080; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 3081; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 3082; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 3083; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] 3084; GFX10-DL-NEXT: s_endpgm 3085 ptr addrspace(1) %src2, 3086 ptr addrspace(1) nocapture %dst) { 3087entry: 3088 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3089 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 3090 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 3091 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 3092 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 3093 3094 %mul = mul <8 x i4> %vec1, %vec2 3095 %mul0 = extractelement <8 x i4> %mul, i64 0 3096 %mul1 = extractelement <8 x i4> %mul, i64 1 3097 %mul2 = extractelement <8 x i4> %mul, i64 2 3098 %mul3 = extractelement <8 x i4> %mul, i64 3 3099 %mul4 = extractelement <8 x i4> 
%mul, i64 4
  %mul5 = extractelement <8 x i4> %mul, i64 5
  %mul6 = extractelement <8 x i4> %mul, i64 6
  %mul7 = extractelement <8 x i4> %mul, i64 7

  %acc = load i4, ptr addrspace(1) %dst, align 4
  %add1 = add i4 %mul0, %acc
  %add2 = add i4 %add1, %mul1
  %add3 = add i4 %add2, %mul2
  %add4 = add i4 %add3, %mul3
  %add5 = add i4 %add4, %mul4
  %add6 = add i4 %add5, %mul5
  %add7 = add i4 %add6, %mul6
  %add8 = add i4 %add7, %mul7

  store i4 %add8, ptr addrspace(1) %dst, align 4
  ret void
}

; udot8_variant1: unsigned 8 x 4-bit dot product written out on scalar i32s.
;
; Each work-item loads one i32 from %v1addr and one from %v2addr (indexed by
; the work-item id), splits both words into eight 4-bit fields with lshr/and,
; multiplies the fields pairwise, adds all eight products to the i32 loaded
; from %dst and stores the sum back to %dst.
;
; Two things distinguish the IR from a straightforward chain:
;   * every multiply has the %v2 field as its first operand and the %v1 field
;     as its second;
;   * the accumulation order is acc + mul1, then mul8 (bits 31:28), then
;     mul2 .. mul7, i.e. the top-nibble product is added second, not last.
;
; The assertions below show the gfx906 and gfx1011/gfx1012 runs selecting a
; single v_dot8_u32_u4, while the gfx700/gfx803 runs expand to v_mad_u32_u24
; chains and the gfx900 run to v_mul_u32_u24 + v_add3_u32.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX7-LABEL: udot8_variant1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v1, 15, v2
; GFX7-NEXT:    v_bfe_u32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v9, 15, v0
; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_u32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_u32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_u32 v8, v2, 24, 4
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
; GFX7-NEXT:    v_bfe_u32 v10, v0, 4, 4
; GFX7-NEXT:    v_bfe_u32 v11, v0, 8, 4
; GFX7-NEXT:    v_bfe_u32 v12, v0, 12, 4
; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
; GFX7-NEXT:    v_bfe_u32 v15, v0, 24, 4
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v3, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v4, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v12, v5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v13, v6, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v14, v7, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v8, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_variant1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_and_b32_e32 v1, 15, v3
; GFX8-NEXT:    v_bfe_u32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_u32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_u32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_u32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_u32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
; GFX8-NEXT:    v_bfe_u32 v5, v0, 4, 4
; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
; GFX8-NEXT:    v_bfe_u32 v9, v0, 12, 4
; GFX8-NEXT:    v_bfe_u32 v11, v0, 16, 4
; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
; GFX8-NEXT:    v_bfe_u32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_u32 v15, v0, 24, 4
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v1
; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v4, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v6, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v9, v8, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v11, v10, v0
; GFX8-NEXT:    v_mad_u32_u24 v0, v13, v12, v0
; GFX8-NEXT:    v_mad_u32_u24 v2, v15, v14, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_variant1:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v3, 15, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 15, v2
; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_u32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_u32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_u32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_u32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_u32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_u32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v4, v3
; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v6, v5
; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v8, v7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v1, v3, s0, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v10, v9
; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v12, v11
; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v5
; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v14, v13
; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v16, v15
; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v7
; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v9
; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_variant1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s0
; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX10-DL-NEXT:    s_endpgm
                                          ptr addrspace(1) %v2addr,
                                          ptr addrspace(1) %dst) {
entry:
  ; One i32 per work-item from each input buffer.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr i32, ptr addrspace(1) %v1addr, i32 %idx
  %v1 = load i32, ptr addrspace(1) %gep1, align 4
  %gep2 = getelementptr i32, ptr addrspace(1) %v2addr, i32 %idx
  %v2 = load i32, ptr addrspace(1) %gep2, align 4
  ; Field 0: bits 3:0.
  %and = and i32 %v1, 15
  %and1 = and i32 %v2, 15
  %mul1 = mul nuw nsw i32 %and1, %and

  ; Field 1: bits 7:4.
  %shr = lshr i32 %v1, 4
  %and2 = and i32 %shr, 15
  %shr3 = lshr i32 %v2, 4
  %and4 = and i32 %shr3, 15
  %mul2 = mul nuw nsw i32 %and4, %and2

  ; Field 2: bits 11:8.
  %shr6 = lshr i32 %v1, 8
  %and7 = and i32 %shr6, 15
  %shr8 = lshr i32 %v2, 8
  %and9 = and i32 %shr8, 15
  %mul3 = mul nuw nsw i32 %and9, %and7

  ; Field 3: bits 15:12.
  %shr12 = lshr i32 %v1, 12
  %and13 = and i32 %shr12, 15
  %shr14 = lshr i32 %v2, 12
  %and15 = and i32 %shr14, 15
  %mul4 = mul nuw nsw i32 %and15, %and13

  ; Field 4: bits 19:16.
  %shr18 = lshr i32 %v1, 16
  %and19 = and i32 %shr18, 15
  %shr20 = lshr i32 %v2, 16
  %and21 = and i32 %shr20, 15
  %mul5 = mul nuw nsw i32 %and21, %and19

  ; Field 5: bits 23:20.
  %shr24 = lshr i32 %v1, 20
  %and25 = and i32 %shr24, 15
  %shr26 = lshr i32 %v2, 20
  %and27 = and i32 %shr26, 15
  %mul6 = mul nuw nsw i32 %and27, %and25

  ; Field 6: bits 27:24.
  %shr30 = lshr i32 %v1, 24
  %and31 = and i32 %shr30, 15
  %shr32 = lshr i32 %v2, 24
  %and33 = and i32 %shr32, 15
  %mul7 = mul nuw nsw i32 %and33, %and31

  ; Field 7: bits 31:28. No mask is applied; lshr by 28 already leaves only
  ; the top four bits.
  %shr36 = lshr i32 %v1, 28
  %shr37 = lshr i32 %v2, 28
  %mul8 = mul nuw nsw i32 %shr37, %shr36
  %acc = load i32, ptr addrspace(1) %dst, align 4

  ; Accumulate: the top-field product (%mul8) is added right after %mul1,
  ; ahead of %mul2 .. %mul7.
  %add1 = add i32 %mul1, %acc
  %add2 = add i32 %add1, %mul8
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7
  store i32 %add8, ptr addrspace(1) %dst, align 4
  ret void
}

; Intrinsic the kernels call to obtain %idx, the per-work-item element index.
declare i32 @llvm.amdgcn.workitem.id.x()