; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s

; @idot8_acc32 -- signed 8 x i4 dot product with a 32-bit accumulator.
; Each work-item loads one <8 x i4> from %src1 and one from %src2 (both indexed
; by the x work-item id), sign-extends every lane to i32, multiplies the lanes
; pairwise, adds the eight products to the i32 loaded from %dst and stores the
; sum back to %dst.
; Selection pinned by the assertions below: gfx906 and the four gfx10 targets
; fold the whole chain into a single v_dot8_i32_i4; gfx700 and gfx803 emit a
; v_mad_i32_i24 chain; gfx900 emits v_mul_i32_i24 followed by v_add3_u32.
; The assertion lines are generated; regenerate them with the script named on
; the first line instead of editing them by hand.
define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s11
; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_mov_b32 s14, -1
; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
; GFX8-NEXT:    s_add_u32 s12, s12, s11
; GFX8-NEXT:    s_addc_u32 s13, s13, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s14, -1
; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
; GFX9-NEXT:    s_add_u32 s12, s12, s11
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_addc_u32 s13, s13, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v2, v3, s0, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v11, v12
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v13, v14
; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v15, v16
; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v8
; GFX9-NEXT:    v_add3_u32 v1, v2, v9, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT:    s_mov_b32 s14, -1
; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc32:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX10-DL-XNACK-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-DL-XNACK-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[6:7]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s0
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
entry:
  ; Fetch this work-item's packed <8 x i4> operand from each source buffer.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2

  ; Lanes 0-7 all follow the same shape: extract the i4 lane from each vector,
  ; sign-extend it to i32 and multiply the pair.
  ; NOTE(review): `nuw` on a product of sign-extended operands makes the result
  ; poison whenever a negative lane meets a multiplier other than 0 or 1.
  ; Presumably this is inherited from the original reproducer; confirm before
  ; changing the flags, and regenerate the assertions if they are changed.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Seed the sum with the accumulator read from %dst, then add the remaining
  ; products in lane order.
  %acc = load i32, ptr addrspace(1) %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  ; Write the updated accumulator back to the same location.
  store i32 %add8, ptr addrspace(1) %dst, align 4
  ret void
}

; TODO: Once the unnecessary zero extensions of the elements are removed,
; the pattern recognizer will kick in.
define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s11
; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(2)
; GFX7-NEXT:    v_bfe_i32 v3, v2,
0, 4 331; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 332; GFX7-NEXT: s_waitcnt vmcnt(1) 333; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 334; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 335; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 336; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 337; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 338; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 339; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 340; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 341; GFX7-NEXT: s_waitcnt vmcnt(0) 342; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 343; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 344; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 345; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 346; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 347; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 348; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 349; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 350; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 351; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 352; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 353; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 354; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 355; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 356; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 357; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 358; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 359; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 360; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 361; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 362; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 363; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 364; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 365; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 366; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 367; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 368; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 369; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 370; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 371; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 372; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 373; GFX7-NEXT: s_endpgm 374; 375; GFX8-LABEL: idot8_acc16: 376; GFX8: ; %bb.0: ; %entry 377; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 378; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x34 379; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 380; GFX8-NEXT: v_mov_b32_e32 v5, 12 381; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 382; GFX8-NEXT: s_waitcnt lgkmcnt(0) 383; GFX8-NEXT: v_mov_b32_e32 v1, s1 384; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 385; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 386; GFX8-NEXT: flat_load_dword v3, v[0:1] 387; GFX8-NEXT: v_mov_b32_e32 v1, s3 388; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 389; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 390; GFX8-NEXT: flat_load_dword v2, v[0:1] 391; GFX8-NEXT: v_mov_b32_e32 v0, s4 392; GFX8-NEXT: v_mov_b32_e32 v1, s5 393; GFX8-NEXT: flat_load_ushort v4, v[0:1] 394; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 395; GFX8-NEXT: s_mov_b32 s14, -1 396; GFX8-NEXT: s_mov_b32 s15, 0xe80000 397; GFX8-NEXT: s_add_u32 s12, s12, s11 398; GFX8-NEXT: s_addc_u32 s13, s13, 0 399; GFX8-NEXT: s_waitcnt vmcnt(2) 400; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 401; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 402; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 403; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 404; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 405; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 406; GFX8-NEXT: s_waitcnt vmcnt(1) 407; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 408; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 409; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 410; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 411; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 412; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 413; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 414; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 415; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 416; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 417; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 418; 
GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 419; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 420; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 421; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 422; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 423; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 424; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 425; GFX8-NEXT: s_waitcnt vmcnt(0) 426; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4 427; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 428; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 429; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 430; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 431; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4 432; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 433; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 434; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 435; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 436; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 437; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 438; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 439; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 440; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 441; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 442; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 443; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 444; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 445; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 446; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 447; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 448; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 449; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 450; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 451; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 452; GFX8-NEXT: flat_store_short v[0:1], v2 453; GFX8-NEXT: s_endpgm 454; 455; GFX9-LABEL: idot8_acc16: 456; GFX9: ; %bb.0: ; %entry 457; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 458; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 459; GFX9-NEXT: s_mov_b32 s14, -1 460; GFX9-NEXT: s_mov_b32 s15, 0xe00000 461; GFX9-NEXT: s_add_u32 s12, s12, s11 462; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 463; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 464; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 465; GFX9-NEXT: v_mov_b32_e32 v4, 12 466; GFX9-NEXT: s_waitcnt lgkmcnt(0) 467; GFX9-NEXT: global_load_dword v1, v0, s[8:9] 468; GFX9-NEXT: global_load_dword v2, v0, s[10:11] 469; GFX9-NEXT: v_mov_b32_e32 v0, 0 470; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] 471; GFX9-NEXT: s_addc_u32 s13, s13, 0 472; GFX9-NEXT: s_waitcnt vmcnt(2) 473; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 474; GFX9-NEXT: s_waitcnt vmcnt(1) 475; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 476; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 477; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2 478; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 479; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 480; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 481; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 482; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 483; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 484; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 485; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 486; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 487; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 488; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 489; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 490; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 491; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 492; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 493; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 494; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 495; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 496; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 497; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 498; GFX9-NEXT: s_waitcnt vmcnt(0) 499; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 500; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 501; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 502; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 503; GFX9-NEXT: 
v_ashrrev_i16_e32 v13, 12, v13 504; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 505; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 506; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 507; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 508; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 509; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 510; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 511; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 512; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 513; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 514; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 515; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 516; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 517; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 518; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 519; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 520; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 521; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 522; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 523; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 524; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 525; GFX9-NEXT: global_store_short v0, v1, s[0:1] 526; GFX9-NEXT: s_endpgm 527; 528; GFX9-DL-LABEL: idot8_acc16: 529; GFX9-DL: ; %bb.0: ; %entry 530; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 531; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 532; GFX9-DL-NEXT: s_mov_b32 s14, -1 533; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 534; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 535; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 536; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 537; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 538; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 539; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 540; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 541; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 542; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 543; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] 544; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 545; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 546; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 547; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 548; 
GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 549; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 550; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 551; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 552; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 553; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 554; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 555; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 556; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 557; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 558; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 559; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 560; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 561; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 562; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 563; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 564; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 565; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 566; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 567; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 568; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 569; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 570; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 571; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 572; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 573; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 574; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 575; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 576; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 577; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 578; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 579; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 580; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 581; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 582; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 583; GFX9-DL-NEXT: 
v_lshlrev_b16_e32 v6, 12, v6 584; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 585; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 586; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 587; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 588; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 589; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 590; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 591; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 592; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 593; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 594; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 595; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 596; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 597; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 598; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] 599; GFX9-DL-NEXT: s_endpgm 600; 601; GFX10-DL-XNACK-LABEL: idot8_acc16: 602; GFX10-DL-XNACK: ; %bb.0: ; %entry 603; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 604; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 605; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 606; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 607; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 608; GFX10-DL-XNACK-NEXT: s_clause 0x1 609; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 610; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 611; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 612; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 613; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 614; GFX10-DL-XNACK-NEXT: s_clause 0x1 615; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] 616; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] 617; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 618; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] 619; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 620; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 621; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 622; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 623; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 
16, v1 624; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 625; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 626; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 627; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 628; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 629; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 630; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 631; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 632; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 633; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 634; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 635; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 636; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 637; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 638; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 639; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 640; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 641; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 642; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 643; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 644; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 645; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 646; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 647; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 648; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 649; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 650; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 651; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 652; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 653; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 654; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 655; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 656; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 657; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 658; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 659; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 660; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 661; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 662; 
GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 663; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 664; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 665; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 666; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 667; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 668; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 669; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 670; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 671; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 672; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 673; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 674; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 675; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 676; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] 677; GFX10-DL-XNACK-NEXT: s_endpgm 678; 679; GFX10-DL-NOXNACK-LABEL: idot8_acc16: 680; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 681; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 682; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 683; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 684; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 685; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 686; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 687; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 688; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 689; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 690; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 691; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 692; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 693; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 694; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] 695; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] 696; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] 697; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 698; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 699; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 700; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 701; 
GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 702; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 703; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 704; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 705; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 706; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 707; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 708; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 709; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 710; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 711; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 712; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 713; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 714; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 715; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 716; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 717; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 718; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 719; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 720; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 721; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 722; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 723; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 724; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 725; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 726; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 727; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 728; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 729; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 730; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 731; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 732; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 733; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 734; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 735; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 736; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 737; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 
v6, 12, v6 738; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 739; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 740; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 741; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 742; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 743; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 744; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 745; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 746; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 747; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 748; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 749; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 750; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 751; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 752; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 753; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 754; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] 755; GFX10-DL-NOXNACK-NEXT: s_endpgm 756 ptr addrspace(1) %src2, 757 ptr addrspace(1) nocapture %dst) { 758entry: 759 %idx = call i32 @llvm.amdgcn.workitem.id.x() 760 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 761 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 762 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 763 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 764 765 %v1e0 = extractelement <8 x i4> %vec1, i64 0 766 %cv1e0 = sext i4 %v1e0 to i16 767 %v2e0 = extractelement <8 x i4> %vec2, i64 0 768 %cv2e0 = sext i4 %v2e0 to i16 769 %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0 770 771 %v1e1 = extractelement <8 x i4> %vec1, i64 1 772 %cv1e1 = sext i4 %v1e1 to i16 773 %v2e1 = extractelement <8 x i4> %vec2, i64 1 774 %cv2e1 = sext i4 %v2e1 to i16 775 %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1 776 777 %v1e2 = extractelement <8 x i4> %vec1, i64 2 778 %cv1e2 = sext i4 %v1e2 to i16 779 %v2e2 = extractelement <8 x i4> %vec2, i64 2 780 %cv2e2 = sext i4 %v2e2 to i16 781 %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2 782 783 %v1e3 = extractelement <8 x 
i4> %vec1, i64 3 784 %cv1e3 = sext i4 %v1e3 to i16 785 %v2e3 = extractelement <8 x i4> %vec2, i64 3 786 %cv2e3 = sext i4 %v2e3 to i16 787 %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3 788 789 %v1e4 = extractelement <8 x i4> %vec1, i64 4 790 %cv1e4 = sext i4 %v1e4 to i16 791 %v2e4 = extractelement <8 x i4> %vec2, i64 4 792 %cv2e4 = sext i4 %v2e4 to i16 793 %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4 794 795 %v1e5 = extractelement <8 x i4> %vec1, i64 5 796 %cv1e5 = sext i4 %v1e5 to i16 797 %v2e5 = extractelement <8 x i4> %vec2, i64 5 798 %cv2e5 = sext i4 %v2e5 to i16 799 %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5 800 801 %v1e6 = extractelement <8 x i4> %vec1, i64 6 802 %cv1e6 = sext i4 %v1e6 to i16 803 %v2e6 = extractelement <8 x i4> %vec2, i64 6 804 %cv2e6 = sext i4 %v2e6 to i16 805 %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6 806 807 %v1e7 = extractelement <8 x i4> %vec1, i64 7 808 %cv1e7 = sext i4 %v1e7 to i16 809 %v2e7 = extractelement <8 x i4> %vec2, i64 7 810 %cv2e7 = sext i4 %v2e7 to i16 811 %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7 812 813 %acc = load i16, ptr addrspace(1) %dst, align 4 814 %add1 = add i16 %mul0, %acc 815 %add2 = add i16 %add1, %mul1 816 %add3 = add i16 %add2, %mul2 817 %add4 = add i16 %add3, %mul3 818 %add5 = add i16 %add4, %mul4 819 %add6 = add i16 %add5, %mul5 820 %add7 = add i16 %add6, %mul6 821 %add8 = add i16 %add7, %mul7 822 823 store i16 %add8, ptr addrspace(1) %dst, align 4 824 ret void 825} 826 827; TODO: Support this pattern (i8 accumulator). The GFX9-DL and GFX10-DL checks for idot8_acc8 still show a chain of 16-bit multiply-adds (v_mad_legacy_u16 / v_mad_u16). 
; idot8_acc8: signed 8 x i4 dot product accumulated into an i8.
; Each work-item uses workitem.id.x to index one <8 x i4> vector in %src1 and one
; in %src2, sign-extends every lane to i8, multiplies the lanes pairwise
; (mul nuw nsw i8), adds the eight products to the i8 loaded from %dst and
; stores the i8 sum back to %dst.
; The per-target CHECK lines are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing them by hand.
828define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, 829; GFX7-LABEL: idot8_acc8: 830; GFX7: ; %bb.0: ; %entry 831; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 832; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 833; GFX7-NEXT: s_mov_b32 s14, -1 834; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 835; GFX7-NEXT: s_add_u32 s12, s12, s11 836; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 837; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 838; GFX7-NEXT: s_mov_b32 s3, 0xf000 839; GFX7-NEXT: s_mov_b32 s6, 0 840; GFX7-NEXT: s_mov_b32 s7, s3 841; GFX7-NEXT: s_waitcnt lgkmcnt(0) 842; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 843; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 844; GFX7-NEXT: v_mov_b32_e32 v1, 0 845; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 846; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 847; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 848; GFX7-NEXT: s_mov_b32 s2, -1 849; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 850; GFX7-NEXT: s_addc_u32 s13, s13, 0 851; GFX7-NEXT: s_waitcnt vmcnt(2) 852; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 853; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 854; GFX7-NEXT: s_waitcnt vmcnt(1) 855; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 856; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 857; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 858; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 859; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 860; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 861; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 862; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 863; GFX7-NEXT: s_waitcnt vmcnt(0) 864; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 865; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 866; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 867; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 868; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 869; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 870; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 871; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 872; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 873; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 874; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 875; 
GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 876; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 877; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 878; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 879; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 880; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 881; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 882; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 883; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 884; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 885; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 886; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 887; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 888; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 889; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 890; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 891; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 892; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 893; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 894; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 895; GFX7-NEXT: s_endpgm 896; 897; GFX8-LABEL: idot8_acc8: 898; GFX8: ; %bb.0: ; %entry 899; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 900; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 901; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 902; GFX8-NEXT: v_mov_b32_e32 v5, 12 903; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 904; GFX8-NEXT: s_waitcnt lgkmcnt(0) 905; GFX8-NEXT: v_mov_b32_e32 v1, s1 906; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 907; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 908; GFX8-NEXT: flat_load_dword v3, v[0:1] 909; GFX8-NEXT: v_mov_b32_e32 v1, s3 910; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 911; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 912; GFX8-NEXT: flat_load_dword v2, v[0:1] 913; GFX8-NEXT: v_mov_b32_e32 v0, s4 914; GFX8-NEXT: v_mov_b32_e32 v1, s5 915; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 916; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 917; GFX8-NEXT: s_mov_b32 s14, -1 918; GFX8-NEXT: s_mov_b32 s15, 0xe80000 919; GFX8-NEXT: s_add_u32 s12, s12, s11 920; GFX8-NEXT: s_addc_u32 s13, s13, 0 921; GFX8-NEXT: s_waitcnt vmcnt(2) 922; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 923; 
GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 924; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 925; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 926; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 927; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 928; GFX8-NEXT: s_waitcnt vmcnt(1) 929; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 930; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 931; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 932; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 933; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 934; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 935; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 936; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 937; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 938; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 939; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 940; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 941; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 942; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 943; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 944; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 945; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 946; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 947; GFX8-NEXT: s_waitcnt vmcnt(0) 948; GFX8-NEXT: v_mad_u16 v4, v5, v16, v4 949; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 950; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 951; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 952; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 953; GFX8-NEXT: v_mad_u16 v4, v10, v15, v4 954; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 955; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 956; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 957; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 958; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 959; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 960; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 961; GFX8-NEXT: 
v_mad_u16 v4, v8, v13, v4 962; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 963; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 964; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 965; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 966; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 967; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 968; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 969; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 970; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 971; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 972; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 973; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 974; GFX8-NEXT: flat_store_byte v[0:1], v2 975; GFX8-NEXT: s_endpgm 976; 977; GFX9-LABEL: idot8_acc8: 978; GFX9: ; %bb.0: ; %entry 979; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 980; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 981; GFX9-NEXT: s_mov_b32 s14, -1 982; GFX9-NEXT: s_mov_b32 s15, 0xe00000 983; GFX9-NEXT: s_add_u32 s12, s12, s11 984; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 985; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 986; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 987; GFX9-NEXT: v_mov_b32_e32 v4, 12 988; GFX9-NEXT: s_waitcnt lgkmcnt(0) 989; GFX9-NEXT: global_load_dword v1, v0, s[8:9] 990; GFX9-NEXT: global_load_dword v2, v0, s[10:11] 991; GFX9-NEXT: v_mov_b32_e32 v0, 0 992; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] 993; GFX9-NEXT: s_addc_u32 s13, s13, 0 994; GFX9-NEXT: s_waitcnt vmcnt(2) 995; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 996; GFX9-NEXT: s_waitcnt vmcnt(1) 997; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 998; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 999; GFX9-NEXT: v_lshlrev_b16_e32 v16, 12, v2 1000; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 1001; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1002; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 1003; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 1004; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 1005; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 1006; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 1007; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 1008; GFX9-NEXT: 
v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1009; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1010; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1011; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1012; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 1013; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 1014; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1015; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1016; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1017; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1018; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1019; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1020; GFX9-NEXT: s_waitcnt vmcnt(0) 1021; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 1022; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1023; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1024; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1025; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1026; GFX9-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 1027; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1028; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1029; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 1030; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1031; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1032; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1033; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1034; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 1035; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1036; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1037; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 1038; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 1039; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1040; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 1041; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1042; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 1043; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 1044; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, 
v10 1045; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1046; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 1047; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 1048; GFX9-NEXT: s_endpgm 1049; 1050; GFX9-DL-LABEL: idot8_acc8: 1051; GFX9-DL: ; %bb.0: ; %entry 1052; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1053; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1054; GFX9-DL-NEXT: s_mov_b32 s14, -1 1055; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 1056; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 1057; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1058; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1059; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1060; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 1061; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1062; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 1063; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 1064; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1065; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] 1066; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 1067; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1068; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 1069; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1070; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 1071; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 1072; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2 1073; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 1074; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1075; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 1076; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 1077; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 1078; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 1079; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 1080; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 1081; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1082; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1083; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 
1084; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 1085; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 1086; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 1087; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 1088; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 1089; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 1090; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 1091; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 1092; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 1093; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1094; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3 1095; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 1096; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 1097; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 1098; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 1099; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3 1100; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 1101; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 1102; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 1103; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 1104; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 1105; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 1106; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 1107; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 1108; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 1109; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 1110; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 1111; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 1112; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 1113; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 1114; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 1115; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 1116; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 1117; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 1118; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 1119; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 1120; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 1121; GFX9-DL-NEXT: s_endpgm 1122; 1123; GFX10-DL-XNACK-LABEL: 
idot8_acc8: 1124; GFX10-DL-XNACK: ; %bb.0: ; %entry 1125; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1126; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1127; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 1128; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 1129; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 1130; GFX10-DL-XNACK-NEXT: s_clause 0x1 1131; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1132; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1133; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1134; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 1135; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX10-DL-XNACK-NEXT: s_clause 0x1 1137; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] 1138; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] 1139; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 1140; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1] 1141; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 1142; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1143; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1144; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1145; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1146; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 1147; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 1148; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 1149; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 1150; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 1151; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 1152; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 1153; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 1154; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 1155; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 1156; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 1157; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 1158; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 1159; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 1160; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, 
v10 1161; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 1162; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 1163; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 1164; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 1165; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 1166; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 1167; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 1168; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 1169; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 1170; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 1171; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 1172; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 1173; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 1174; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 1175; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 1176; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 1177; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 1178; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 1179; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 1180; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 1181; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 1182; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 1183; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 1184; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 1185; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 1186; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 1187; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 1188; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 1189; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 1190; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 1191; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 1192; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 1193; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 1194; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 1195; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 1196; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 1197; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 1198; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1] 
1199; GFX10-DL-XNACK-NEXT: s_endpgm 1200; 1201; GFX10-DL-NOXNACK-LABEL: idot8_acc8: 1202; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 1203; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1204; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1205; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 1206; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 1207; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 1208; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1209; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1210; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1211; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1212; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 1213; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 1214; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1216; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] 1217; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] 1218; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1] 1219; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 1220; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 1221; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1 1222; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 20, v1 1223; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1224; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 1225; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 1226; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 1227; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 1228; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 1229; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 1230; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 1231; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 1232; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 1233; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 1234; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 1235; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 1236; 
GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 1237; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 1238; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 1239; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 1240; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 1241; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 1242; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 1243; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 1244; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 1245; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 1246; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 1247; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 1248; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 1249; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 1250; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 1251; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 1252; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 1253; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 1254; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 1255; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 1256; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 1257; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 1258; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 1259; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 1260; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 1261; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 1262; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 1263; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 1264; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 1265; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 1266; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 1267; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 1268; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 1269; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 1270; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 1271; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 1272; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 
v4, 12, v4 1273; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 1274; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 1275; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 1276; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1] 1277; GFX10-DL-NOXNACK-NEXT: s_endpgm 1278 ptr addrspace(1) %src2, 1279 ptr addrspace(1) nocapture %dst) { 1280entry: 1281 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1282 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 1283 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 1284 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 1285 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 1286 1287 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1288 %cv1e0 = sext i4 %v1e0 to i8 1289 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1290 %cv2e0 = sext i4 %v2e0 to i8 1291 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 1292 1293 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1294 %cv1e1 = sext i4 %v1e1 to i8 1295 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1296 %cv2e1 = sext i4 %v2e1 to i8 1297 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 1298 1299 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1300 %cv1e2 = sext i4 %v1e2 to i8 1301 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1302 %cv2e2 = sext i4 %v2e2 to i8 1303 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 1304 1305 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1306 %cv1e3 = sext i4 %v1e3 to i8 1307 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1308 %cv2e3 = sext i4 %v2e3 to i8 1309 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 1310 1311 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1312 %cv1e4 = sext i4 %v1e4 to i8 1313 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1314 %cv2e4 = sext i4 %v2e4 to i8 1315 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 1316 1317 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1318 %cv1e5 = sext i4 %v1e5 to i8 1319 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1320 %cv2e5 = sext i4 %v2e5 to i8 1321 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 1322 1323 %v1e6 = extractelement <8 x i4> %vec1, i64 6 
1324 %cv1e6 = sext i4 %v1e6 to i8 1325 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1326 %cv2e6 = sext i4 %v2e6 to i8 1327 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 1328 1329 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1330 %cv1e7 = sext i4 %v1e7 to i8 1331 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1332 %cv2e7 = sext i4 %v2e7 to i8 1333 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 1334 1335 %acc = load i8, ptr addrspace(1) %dst, align 4 1336 %add1 = add i8 %mul0, %acc 1337 %add2 = add i8 %add1, %mul1 1338 %add3 = add i8 %add2, %mul2 1339 %add4 = add i8 %add3, %mul3 1340 %add5 = add i8 %add4, %mul4 1341 %add6 = add i8 %add5, %mul5 1342 %add7 = add i8 %add6, %mul6 1343 %add8 = add i8 %add7, %mul7 1344 1345 store i8 %add8, ptr addrspace(1) %dst, align 4 1346 ret void 1347} 1348 1349; Make sure the pattern is not recognized if there are multiple uses of the 1350; intermediate multiplications. 1351define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, 1352; GFX7-LABEL: idot8_multiuses_mul1: 1353; GFX7: ; %bb.0: ; %entry 1354; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1355; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1356; GFX7-NEXT: s_mov_b32 s14, -1 1357; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 1358; GFX7-NEXT: s_add_u32 s12, s12, s11 1359; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1360; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1361; GFX7-NEXT: s_mov_b32 s3, 0xf000 1362; GFX7-NEXT: s_mov_b32 s6, 0 1363; GFX7-NEXT: s_mov_b32 s7, s3 1364; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1365; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1366; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1367; GFX7-NEXT: v_mov_b32_e32 v1, 0 1368; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1369; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1370; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1371; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1372; GFX7-NEXT: s_mov_b32 s2, -1 1373; GFX7-NEXT: s_addc_u32 s13, s13, 0 1374; GFX7-NEXT: s_waitcnt vmcnt(1) 1375; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 
4 1376; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 1377; GFX7-NEXT: s_waitcnt vmcnt(0) 1378; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 1379; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4 1381; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 1382; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16 1383; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 1384; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 1385; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1 1386; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 1387; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 1388; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1 1389; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 1390; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 1391; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1 1392; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 1393; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 1394; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1 1395; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 1396; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 1397; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1 1398; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 1399; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 1400; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1 1401; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 1402; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0 1403; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1404; GFX7-NEXT: s_endpgm 1405; 1406; GFX8-LABEL: idot8_multiuses_mul1: 1407; GFX8: ; %bb.0: ; %entry 1408; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1409; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1410; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1411; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1412; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1413; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1414; GFX8-NEXT: v_mov_b32_e32 v1, s1 1415; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1416; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1417; GFX8-NEXT: flat_load_dword v3, v[0:1] 1418; GFX8-NEXT: v_mov_b32_e32 v1, s3 1419; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1420; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1421; GFX8-NEXT: flat_load_dword v0, v[0:1] 1422; 
GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1423; GFX8-NEXT: s_mov_b32 s14, -1 1424; GFX8-NEXT: s_mov_b32 s15, 0xe80000 1425; GFX8-NEXT: s_add_u32 s12, s12, s11 1426; GFX8-NEXT: s_addc_u32 s13, s13, 0 1427; GFX8-NEXT: s_waitcnt vmcnt(1) 1428; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 1429; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 1430; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4 1431; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4 1432; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 1433; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 1434; GFX8-NEXT: s_waitcnt vmcnt(0) 1435; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 1436; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0 1438; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4 1439; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16 1440; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4 1441; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 1442; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4 1443; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 1444; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4 1445; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1 1446; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4 1447; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1 1448; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4 1449; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4 1450; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1 1451; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3 1452; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0 1453; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1 1454; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1 1455; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0 1456; GFX8-NEXT: v_mov_b32_e32 v0, s4 1457; GFX8-NEXT: v_mov_b32_e32 v1, s5 1458; GFX8-NEXT: flat_store_dword v[0:1], v2 1459; GFX8-NEXT: s_endpgm 1460; 1461; GFX9-LABEL: idot8_multiuses_mul1: 1462; GFX9: ; %bb.0: ; %entry 1463; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1464; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1465; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1466; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1467; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1468; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1469; GFX9-NEXT: global_load_dword v1, 
v0, s[0:1] 1470; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1471; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 1472; GFX9-NEXT: s_mov_b32 s14, -1 1473; GFX9-NEXT: s_mov_b32 s15, 0xe00000 1474; GFX9-NEXT: s_add_u32 s12, s12, s11 1475; GFX9-NEXT: v_mov_b32_e32 v0, 0 1476; GFX9-NEXT: s_addc_u32 s13, s13, 0 1477; GFX9-NEXT: s_waitcnt vmcnt(1) 1478; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 1479; GFX9-NEXT: s_waitcnt vmcnt(0) 1480; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4 1481; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4 1482; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4 1483; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4 1484; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4 1485; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4 1486; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4 1487; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4 1488; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4 1489; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4 1490; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4 1491; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4 1492; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4 1493; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1 1494; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 1495; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 1496; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1497; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0 1498; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6 1499; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8 1500; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2 1501; GFX9-NEXT: v_mul_i32_i24_e32 v7, v9, v10 1502; GFX9-NEXT: v_mul_i32_i24_e32 v8, v11, v12 1503; GFX9-NEXT: v_add3_u32 v3, v3, v5, v6 1504; GFX9-NEXT: v_mul_i32_i24_e32 v9, v13, v14 1505; GFX9-NEXT: v_mul_i32_i24_e32 v10, v15, v16 1506; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 1507; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 1508; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 1509; GFX9-NEXT: global_store_dword v0, v1, s[6:7] 1510; GFX9-NEXT: s_endpgm 1511; 1512; GFX9-DL-LABEL: idot8_multiuses_mul1: 1513; GFX9-DL: ; %bb.0: ; %entry 1514; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1515; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1516; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1517; 
GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1518; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1519; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1520; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1521; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1522; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1523; GFX9-DL-NEXT: s_mov_b32 s14, -1 1524; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 1525; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 1526; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1527; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 1528; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1529; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4 1530; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1531; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 4 1532; GFX9-DL-NEXT: v_bfe_i32 v5, v1, 4, 4 1533; GFX9-DL-NEXT: v_bfe_i32 v6, v2, 4, 4 1534; GFX9-DL-NEXT: v_bfe_i32 v7, v1, 8, 4 1535; GFX9-DL-NEXT: v_bfe_i32 v8, v2, 8, 4 1536; GFX9-DL-NEXT: v_bfe_i32 v9, v1, 12, 4 1537; GFX9-DL-NEXT: v_bfe_i32 v10, v2, 12, 4 1538; GFX9-DL-NEXT: v_bfe_i32 v11, v1, 16, 4 1539; GFX9-DL-NEXT: v_bfe_i32 v12, v2, 16, 4 1540; GFX9-DL-NEXT: v_bfe_i32 v13, v1, 20, 4 1541; GFX9-DL-NEXT: v_bfe_i32 v14, v2, 20, 4 1542; GFX9-DL-NEXT: v_bfe_i32 v15, v1, 24, 4 1543; GFX9-DL-NEXT: v_bfe_i32 v16, v2, 24, 4 1544; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1 1545; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2 1546; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2 1547; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1548; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 1549; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6 1550; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8 1551; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2 1552; GFX9-DL-NEXT: v_mul_i32_i24_e32 v7, v9, v10 1553; GFX9-DL-NEXT: v_mul_i32_i24_e32 v8, v11, v12 1554; GFX9-DL-NEXT: v_add3_u32 v3, v3, v5, v6 1555; GFX9-DL-NEXT: v_mul_i32_i24_e32 v9, v13, v14 1556; GFX9-DL-NEXT: v_mul_i32_i24_e32 v10, v15, v16 1557; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8 1558; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10 1559; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2 1560; GFX9-DL-NEXT: 
global_store_dword v0, v1, s[6:7] 1561; GFX9-DL-NEXT: s_endpgm 1562; 1563; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: 1564; GFX10-DL-XNACK: ; %bb.0: ; %entry 1565; GFX10-DL-XNACK-NEXT: s_clause 0x1 1566; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1567; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1568; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1569; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1570; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1571; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 1572; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 1573; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 1574; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 1575; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 1576; GFX10-DL-XNACK-NEXT: s_clause 0x1 1577; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] 1578; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3] 1579; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3 1580; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0 1581; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 1582; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4 1583; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4 1584; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 1585; GFX10-DL-XNACK-NEXT: v_bfe_i32 v4, v2, 4, 4 1586; GFX10-DL-XNACK-NEXT: v_bfe_i32 v5, v1, 8, 4 1587; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v2, 8, 4 1588; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v2, 0, 4 1589; GFX10-DL-XNACK-NEXT: v_bfe_i32 v8, v1, 12, 4 1590; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4 1591; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4 1592; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 1593; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 1594; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s0 1595; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 1596; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4 1597; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 1598; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4 1599; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5 1600; 
GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4 1601; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4 1602; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9 1603; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10 1604; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 1605; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12 1606; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13 1607; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 1608; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2 1609; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6 1610; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2 1611; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 1612; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 1613; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5 1614; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[6:7] 1615; GFX10-DL-XNACK-NEXT: s_endpgm 1616; 1617; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: 1618; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 1619; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1620; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1621; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1622; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1623; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1624; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 1625; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 1626; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 1627; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 1628; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 1629; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1630; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] 1631; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3] 1632; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0 1633; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 1634; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4 1635; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4 1636; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 1637; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v4, v0, 4, 4 
1638; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v5, v1, 8, 4 1639; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v0, 8, 4 1640; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v0, 0, 4 1641; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v8, v1, 12, 4 1642; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v3, v4 1643; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4 1644; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 1645; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 1646; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s0 1647; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 1648; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4 1649; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 1650; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4 1651; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5 1652; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4 1653; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4 1654; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9 1655; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10 1656; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4 1657; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12 1658; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13 1659; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 1660; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0 1661; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6 1662; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0 1663; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4 1664; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 1665; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5 1666; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] 1667; GFX10-DL-NOXNACK-NEXT: s_endpgm 1668 ptr addrspace(1) %src2, 1669 ptr addrspace(1) nocapture %dst) { 1670entry: 1671 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1672 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 1673 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 1674 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 1675 %vec2 = load <8 x i4>, ptr 
addrspace(1) %gep2 1676 1677 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1678 %cv1e0 = sext i4 %v1e0 to i32 1679 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1680 %cv2e0 = sext i4 %v2e0 to i32 1681 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 1682 1683 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1684 %cv1e1 = sext i4 %v1e1 to i32 1685 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1686 %cv2e1 = sext i4 %v2e1 to i32 1687 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 1688 1689 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1690 %cv1e2 = sext i4 %v1e2 to i32 1691 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1692 %cv2e2 = sext i4 %v2e2 to i32 1693 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 1694 1695 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1696 %cv1e3 = sext i4 %v1e3 to i32 1697 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1698 %cv2e3 = sext i4 %v2e3 to i32 1699 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 1700 1701 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1702 %cv1e4 = sext i4 %v1e4 to i32 1703 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1704 %cv2e4 = sext i4 %v2e4 to i32 1705 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 1706 1707 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1708 %cv1e5 = sext i4 %v1e5 to i32 1709 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1710 %cv2e5 = sext i4 %v2e5 to i32 1711 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 1712 1713 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1714 %cv1e6 = sext i4 %v1e6 to i32 1715 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1716 %cv2e6 = sext i4 %v2e6 to i32 1717 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 1718 1719 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1720 %cv1e7 = sext i4 %v1e7 to i32 1721 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1722 %cv2e7 = sext i4 %v2e7 to i32 1723 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 1724 1725 %acc = load i32, ptr addrspace(1) %dst, align 4 1726 %add = add i32 %mul0, %acc 1727 %add1 = add i32 %mul0, %add 1728 %add2 = add i32 %add1, %mul1 1729 %add3 = add i32 %add2, %mul2 1730 %add4 = add i32 %add3, %mul3 
1731 %add5 = add i32 %add4, %mul4 1732 %add6 = add i32 %add5, %mul5 1733 %add7 = add i32 %add6, %mul6 1734 %add8 = add i32 %add7, %mul7 1735 1736 %res = add i32 %add, %add8 1737 store i32 %res, ptr addrspace(1) %dst, align 4 1738 ret void 1739} 1740 1741; NOTE: The gfx906 and gfx10 check lines below already select a single v_dot8_i32_i4 for this vector-multiply pattern, so the former "TODO: Support this pattern." appears to be resolved for the dot-capable targets. 1742define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, 1743; GFX7-LABEL: idot8_acc32_vecMul: 1744; GFX7: ; %bb.0: ; %entry 1745; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1746; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1747; GFX7-NEXT: s_mov_b32 s14, -1 1748; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 1749; GFX7-NEXT: s_add_u32 s12, s12, s11 1750; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1751; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1752; GFX7-NEXT: s_mov_b32 s3, 0xf000 1753; GFX7-NEXT: s_mov_b32 s6, 0 1754; GFX7-NEXT: s_mov_b32 s7, s3 1755; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1756; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1757; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1758; GFX7-NEXT: v_mov_b32_e32 v1, 0 1759; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1760; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1761; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1762; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1763; GFX7-NEXT: s_mov_b32 s2, -1 1764; GFX7-NEXT: s_addc_u32 s13, s13, 0 1765; GFX7-NEXT: s_waitcnt vmcnt(1) 1766; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2 1767; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 1768; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 1769; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 1770; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 1771; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 1772; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4 1773; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 1774; GFX7-NEXT: s_waitcnt vmcnt(0) 1775; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0 1776; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 1777; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 1778; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 1779; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 1780; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 1781; GFX7-NEXT: v_bfe_i32 v15, 
v0, 4, 4 1782; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 1783; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1784; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4 1785; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0 1786; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0 1787; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0 1788; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0 1789; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0 1790; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0 1791; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0 1792; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1793; GFX7-NEXT: s_endpgm 1794; 1795; GFX8-LABEL: idot8_acc32_vecMul: 1796; GFX8: ; %bb.0: ; %entry 1797; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1798; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1799; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1800; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1801; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1802; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1803; GFX8-NEXT: v_mov_b32_e32 v1, s1 1804; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1805; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1806; GFX8-NEXT: flat_load_dword v3, v[0:1] 1807; GFX8-NEXT: v_mov_b32_e32 v1, s3 1808; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1809; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1810; GFX8-NEXT: flat_load_dword v0, v[0:1] 1811; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1812; GFX8-NEXT: s_mov_b32 s14, -1 1813; GFX8-NEXT: s_mov_b32 s15, 0xe80000 1814; GFX8-NEXT: s_add_u32 s12, s12, s11 1815; GFX8-NEXT: s_addc_u32 s13, s13, 0 1816; GFX8-NEXT: s_waitcnt vmcnt(1) 1817; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 1818; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4 1819; GFX8-NEXT: v_bfe_i32 v4, v3, 20, 4 1820; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 4 1821; GFX8-NEXT: v_bfe_i32 v6, v3, 12, 4 1822; GFX8-NEXT: v_bfe_i32 v7, v3, 8, 4 1823; GFX8-NEXT: v_bfe_i32 v8, v3, 4, 4 1824; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 4 1825; GFX8-NEXT: s_waitcnt vmcnt(0) 1826; GFX8-NEXT: v_ashrrev_i32_e32 v9, 28, v0 1827; GFX8-NEXT: v_bfe_i32 v10, v0, 24, 4 1828; GFX8-NEXT: v_bfe_i32 v11, v0, 
20, 4 1829; GFX8-NEXT: v_bfe_i32 v12, v0, 16, 4 1830; GFX8-NEXT: v_bfe_i32 v13, v0, 12, 4 1831; GFX8-NEXT: v_bfe_i32 v14, v0, 8, 4 1832; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4 1833; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4 1834; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1835; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0 1836; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0 1837; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0 1838; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0 1839; GFX8-NEXT: v_mad_i32_i24 v0, v5, v12, v0 1840; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0 1841; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0 1842; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0 1843; GFX8-NEXT: v_mov_b32_e32 v0, s4 1844; GFX8-NEXT: v_mov_b32_e32 v1, s5 1845; GFX8-NEXT: flat_store_dword v[0:1], v2 1846; GFX8-NEXT: s_endpgm 1847; 1848; GFX9-LABEL: idot8_acc32_vecMul: 1849; GFX9: ; %bb.0: ; %entry 1850; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1851; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1852; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1853; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1854; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1855; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1856; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 1857; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1858; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 1859; GFX9-NEXT: s_mov_b32 s14, -1 1860; GFX9-NEXT: s_mov_b32 s15, 0xe00000 1861; GFX9-NEXT: s_add_u32 s12, s12, s11 1862; GFX9-NEXT: v_mov_b32_e32 v0, 0 1863; GFX9-NEXT: s_addc_u32 s13, s13, 0 1864; GFX9-NEXT: s_waitcnt vmcnt(1) 1865; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1 1866; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4 1867; GFX9-NEXT: v_bfe_i32 v5, v1, 20, 4 1868; GFX9-NEXT: v_bfe_i32 v6, v1, 16, 4 1869; GFX9-NEXT: v_bfe_i32 v7, v1, 12, 4 1870; GFX9-NEXT: v_bfe_i32 v8, v1, 8, 4 1871; GFX9-NEXT: v_bfe_i32 v9, v1, 4, 4 1872; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 4 1873; GFX9-NEXT: s_waitcnt vmcnt(0) 1874; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v2 1875; GFX9-NEXT: v_bfe_i32 v11, v2, 24, 4 1876; GFX9-NEXT: v_bfe_i32 
v12, v2, 20, 4 1877; GFX9-NEXT: v_bfe_i32 v13, v2, 16, 4 1878; GFX9-NEXT: v_bfe_i32 v14, v2, 12, 4 1879; GFX9-NEXT: v_bfe_i32 v15, v2, 8, 4 1880; GFX9-NEXT: v_bfe_i32 v16, v2, 4, 4 1881; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 4 1882; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 1883; GFX9-NEXT: v_mul_i32_i24_e32 v2, v9, v16 1884; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15 1885; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14 1886; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1887; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 1888; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13 1889; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12 1890; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 1891; GFX9-NEXT: v_mul_i32_i24_e32 v4, v4, v11 1892; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10 1893; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 1894; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 1895; GFX9-NEXT: global_store_dword v0, v1, s[6:7] 1896; GFX9-NEXT: s_endpgm 1897; 1898; GFX9-DL-LABEL: idot8_acc32_vecMul: 1899; GFX9-DL: ; %bb.0: ; %entry 1900; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1901; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1902; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1903; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1904; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1905; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1906; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1907; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1908; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1909; GFX9-DL-NEXT: s_mov_b32 s14, -1 1910; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 1911; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 1912; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1913; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 1914; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1915; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 1916; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1917; GFX9-DL-NEXT: s_endpgm 1918; 1919; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: 1920; GFX10-DL-XNACK: ; %bb.0: ; %entry 1921; GFX10-DL-XNACK-NEXT: s_clause 0x1 1922; GFX10-DL-XNACK-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 1923; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1924; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1925; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1926; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1927; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 1928; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 1929; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 1930; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 1931; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 1932; GFX10-DL-XNACK-NEXT: s_clause 0x1 1933; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] 1934; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3] 1935; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3 1936; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0 1937; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 1938; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1939; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 1940; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7] 1941; GFX10-DL-XNACK-NEXT: s_endpgm 1942; 1943; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: 1944; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 1945; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1946; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1947; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1948; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 1949; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 1950; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 1951; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 1952; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 1953; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 1954; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 1955; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 1956; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 1957; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] 1958; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3] 1959; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0 1960; 
GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1961; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0 1962; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] 1963; GFX10-DL-NOXNACK-NEXT: s_endpgm 1964 ptr addrspace(1) %src2, 1965 ptr addrspace(1) nocapture %dst) { 1966entry: 1967 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1968 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 1969 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 1970 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 1971 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 1972 1973 %cvec1 = sext <8 x i4> %vec1 to <8 x i32> 1974 %cvec2 = sext <8 x i4> %vec2 to <8 x i32> 1975 1976 %mul = mul <8 x i32> %cvec1, %cvec2 1977 %mul0 = extractelement <8 x i32> %mul, i64 0 1978 %mul1 = extractelement <8 x i32> %mul, i64 1 1979 %mul2 = extractelement <8 x i32> %mul, i64 2 1980 %mul3 = extractelement <8 x i32> %mul, i64 3 1981 %mul4 = extractelement <8 x i32> %mul, i64 4 1982 %mul5 = extractelement <8 x i32> %mul, i64 5 1983 %mul6 = extractelement <8 x i32> %mul, i64 6 1984 %mul7 = extractelement <8 x i32> %mul, i64 7 1985 1986 %acc = load i32, ptr addrspace(1) %dst, align 4 1987 %add1 = add i32 %mul0, %acc 1988 %add2 = add i32 %add1, %mul1 1989 %add3 = add i32 %add2, %mul2 1990 %add4 = add i32 %add3, %mul3 1991 %add5 = add i32 %add4, %mul4 1992 %add6 = add i32 %add5, %mul5 1993 %add7 = add i32 %add6, %mul6 1994 %add8 = add i32 %add7, %mul7 1995 1996 store i32 %add8, ptr addrspace(1) %dst, align 4 1997 ret void 1998} 1999 2000; TODO: Support this pattern. 
2001define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, 2002; GFX7-LABEL: idot8_acc16_vecMul: 2003; GFX7: ; %bb.0: ; %entry 2004; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2005; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2006; GFX7-NEXT: s_mov_b32 s14, -1 2007; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 2008; GFX7-NEXT: s_add_u32 s12, s12, s11 2009; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2010; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2011; GFX7-NEXT: s_mov_b32 s3, 0xf000 2012; GFX7-NEXT: s_mov_b32 s6, 0 2013; GFX7-NEXT: s_mov_b32 s7, s3 2014; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2015; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2016; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2017; GFX7-NEXT: v_mov_b32_e32 v1, 0 2018; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2019; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2020; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2021; GFX7-NEXT: s_mov_b32 s2, -1 2022; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 2023; GFX7-NEXT: s_addc_u32 s13, s13, 0 2024; GFX7-NEXT: s_waitcnt vmcnt(2) 2025; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 2026; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 2027; GFX7-NEXT: s_waitcnt vmcnt(1) 2028; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 2029; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4 2030; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 2031; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v2 2032; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 2033; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 2034; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 2035; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 2036; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 2037; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 2038; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 2039; GFX7-NEXT: v_ashrrev_i32_e32 v14, 28, v0 2040; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 2041; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 2042; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 2043; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 2044; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 2045; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 2046; GFX7-NEXT: s_waitcnt vmcnt(0) 
2047; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 2048; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 2049; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 2050; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2051; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 2052; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 2053; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 2054; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 2055; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 2056; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 2057; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 2058; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 2059; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 2060; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 2061; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 2062; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 2063; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 2064; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 2065; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 2066; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 2067; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 2068; GFX7-NEXT: s_endpgm 2069; 2070; GFX8-LABEL: idot8_acc16_vecMul: 2071; GFX8: ; %bb.0: ; %entry 2072; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2073; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2074; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2075; GFX8-NEXT: v_mov_b32_e32 v5, 12 2076; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2077; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2078; GFX8-NEXT: v_mov_b32_e32 v1, s1 2079; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2080; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2081; GFX8-NEXT: flat_load_dword v3, v[0:1] 2082; GFX8-NEXT: v_mov_b32_e32 v1, s3 2083; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2084; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2085; GFX8-NEXT: flat_load_dword v2, v[0:1] 2086; GFX8-NEXT: v_mov_b32_e32 v0, s4 2087; GFX8-NEXT: v_mov_b32_e32 v1, s5 2088; GFX8-NEXT: flat_load_ushort v4, v[0:1] 2089; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2090; GFX8-NEXT: s_mov_b32 s14, -1 2091; GFX8-NEXT: s_mov_b32 s15, 0xe80000 2092; 
GFX8-NEXT: s_add_u32 s12, s12, s11 2093; GFX8-NEXT: s_addc_u32 s13, s13, 0 2094; GFX8-NEXT: s_waitcnt vmcnt(2) 2095; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 2096; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2097; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 2098; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2099; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 2100; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 2101; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 2102; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 2103; GFX8-NEXT: s_waitcnt vmcnt(1) 2104; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 2105; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2106; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 2107; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2108; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 2109; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 2110; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 2111; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 2112; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2113; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 2114; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v18 2115; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2116; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 2117; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17 2118; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2119; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 2120; GFX8-NEXT: s_waitcnt vmcnt(0) 2121; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 2122; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 2123; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16 2124; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2125; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2126; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 2127; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2128; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2129; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 2130; 
GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2131; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2132; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 2133; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2134; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 2135; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2136; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2137; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 2138; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 2139; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2140; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2141; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2142; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 2143; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2144; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2145; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 2146; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 2147; GFX8-NEXT: flat_store_short v[0:1], v2 2148; GFX8-NEXT: s_endpgm 2149; 2150; GFX9-LABEL: idot8_acc16_vecMul: 2151; GFX9: ; %bb.0: ; %entry 2152; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2153; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2154; GFX9-NEXT: s_mov_b32 s14, -1 2155; GFX9-NEXT: s_mov_b32 s15, 0xe00000 2156; GFX9-NEXT: s_add_u32 s12, s12, s11 2157; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2158; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2159; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2160; GFX9-NEXT: v_mov_b32_e32 v4, 12 2161; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2162; GFX9-NEXT: global_load_dword v1, v0, s[8:9] 2163; GFX9-NEXT: global_load_dword v2, v0, s[10:11] 2164; GFX9-NEXT: v_mov_b32_e32 v0, 0 2165; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] 2166; GFX9-NEXT: s_mov_b32 s2, 0x5040100 2167; GFX9-NEXT: s_addc_u32 s13, s13, 0 2168; GFX9-NEXT: s_waitcnt vmcnt(2) 2169; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 2170; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 2171; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 2172; GFX9-NEXT: s_waitcnt vmcnt(1) 2173; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2 2174; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v1 2175; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v2 2176; GFX9-NEXT: 
v_lshlrev_b16_e32 v6, 12, v6 2177; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 2178; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2179; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2180; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2181; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1 2182; GFX9-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2183; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2184; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 2185; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 2186; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2187; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2 2188; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2189; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2190; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2191; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2192; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2193; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2194; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2195; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2196; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 2197; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v1 2198; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 2199; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 2200; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 2201; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 2202; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2 2203; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 2204; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 2205; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2206; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2207; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2208; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2209; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2210; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 2211; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2212; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2213; GFX9-NEXT: 
v_ashrrev_i16_e32 v17, 12, v17 2214; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2215; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 2216; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 2217; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 2218; GFX9-NEXT: v_perm_b32 v4, v17, v16, s2 2219; GFX9-NEXT: v_perm_b32 v9, v10, v9, s2 2220; GFX9-NEXT: v_perm_b32 v10, v15, v14, s2 2221; GFX9-NEXT: s_waitcnt vmcnt(0) 2222; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 2223; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2224; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v4 2225; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v10 2226; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2227; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 2228; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2229; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 2230; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2231; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 2232; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2233; GFX9-NEXT: global_store_short v0, v1, s[0:1] 2234; GFX9-NEXT: s_endpgm 2235; 2236; GFX9-DL-LABEL: idot8_acc16_vecMul: 2237; GFX9-DL: ; %bb.0: ; %entry 2238; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2239; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2240; GFX9-DL-NEXT: s_mov_b32 s14, -1 2241; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 2242; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 2243; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2244; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2245; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2246; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 2247; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2248; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 2249; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 2250; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2251; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] 2252; GFX9-DL-NEXT: s_mov_b32 s2, 
0x5040100 2253; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 2254; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2255; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 2256; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 2257; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 2258; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2259; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 2260; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v1 2261; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v2 2262; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 2263; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 2264; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2265; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2266; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2267; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1 2268; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2269; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2270; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 2271; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 2272; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2273; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2 2274; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2275; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2276; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2277; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2278; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2279; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2280; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2281; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2282; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 2283; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v1 2284; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 2285; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 2286; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17 2287; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 2288; 
GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 2289; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 2290; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 2291; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2292; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2293; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 2294; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2295; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 2296; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 2297; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 2298; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 2299; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 2300; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2301; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 2302; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 2303; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 2304; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s2 2305; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s2 2306; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s2 2307; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2308; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 2309; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2310; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v4 2311; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v10 2312; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2313; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 2314; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2315; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 2316; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2317; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 2318; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2319; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] 2320; GFX9-DL-NEXT: s_endpgm 2321; 2322; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: 2323; GFX10-DL-XNACK: ; %bb.0: ; %entry 2324; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2325; GFX10-DL-XNACK-NEXT: 
s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2326; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 2327; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 2328; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 2329; GFX10-DL-XNACK-NEXT: s_clause 0x1 2330; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2331; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2332; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2333; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 2334; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 2335; GFX10-DL-XNACK-NEXT: s_clause 0x1 2336; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] 2337; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] 2338; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 2339; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] 2340; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 2341; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 2342; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 2343; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 2344; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v1 2345; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v2 2346; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 2347; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 2348; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 2349; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 2350; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2351; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2 2352; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 2353; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 2354; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 2355; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 2356; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 2357; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 2358; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 2359; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 2360; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 2361; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 2362; GFX10-DL-XNACK-NEXT: v_perm_b32 v11, v12, v11, 
0x5040100 2363; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 2364; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 2365; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v2 2366; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 2367; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 2368; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 2369; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v8 2370; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v13 2371; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v14 2372; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 2373; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 2374; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 2375; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 2376; GFX10-DL-XNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 2377; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 2378; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2379; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 2380; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 2381; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 2382; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2383; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v2 2384; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 2385; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 2386; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 2387; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v16 2388; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 2389; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 2390; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 2391; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 2392; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v17 2393; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 2394; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 2395; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 2396; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 2397; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 2398; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 2399; 
GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v12 2400; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 2401; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 2402; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 2403; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 2404; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 2405; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 2406; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2407; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 2408; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 2409; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5 2410; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2411; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1 2412; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 2413; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] 2414; GFX10-DL-XNACK-NEXT: s_endpgm 2415; 2416; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: 2417; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 2418; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2419; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2420; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 2421; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 2422; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 2423; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 2424; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2425; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2426; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2427; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 2428; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 2429; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 2430; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 2431; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] 2432; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] 2433; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] 2434; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 2435; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 2436; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 2437; 
GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 2438; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v1 2439; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v0 2440; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 2441; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 2442; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 2443; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 2444; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 2445; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0 2446; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 2447; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 2448; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 2449; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 2450; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 2451; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 2452; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 2453; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 2454; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 2455; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 2456; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 2457; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 2458; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 2459; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v0 2460; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 2461; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 2462; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 2463; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v8 2464; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v13 2465; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v14 2466; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 2467; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 2468; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 2469; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 2470; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 2471; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 2472; 
GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2473; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 2474; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 2475; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 2476; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 2477; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v0 2478; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 2479; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 2480; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0 2481; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v16 2482; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 2483; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 2484; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 2485; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 2486; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v17 2487; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 2488; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 2489; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 2490; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 2491; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 2492; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 2493; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v12 2494; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 2495; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 2496; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 2497; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 2498; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 2499; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 2500; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 2501; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 2502; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 2503; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5 2504; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2505; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 2506; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 2507; GFX10-DL-NOXNACK-NEXT: 
global_store_short v2, v0, s[0:1] 2508; GFX10-DL-NOXNACK-NEXT: s_endpgm 2509 ptr addrspace(1) %src2, 2510 ptr addrspace(1) nocapture %dst) { 2511entry: 2512 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2513 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 2514 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 2515 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 2516 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 2517 2518 %cvec1 = sext <8 x i4> %vec1 to <8 x i16> 2519 %cvec2 = sext <8 x i4> %vec2 to <8 x i16> 2520 2521 %mul = mul <8 x i16> %cvec1, %cvec2 2522 %mul0 = extractelement <8 x i16> %mul, i64 0 2523 %mul1 = extractelement <8 x i16> %mul, i64 1 2524 %mul2 = extractelement <8 x i16> %mul, i64 2 2525 %mul3 = extractelement <8 x i16> %mul, i64 3 2526 %mul4 = extractelement <8 x i16> %mul, i64 4 2527 %mul5 = extractelement <8 x i16> %mul, i64 5 2528 %mul6 = extractelement <8 x i16> %mul, i64 6 2529 %mul7 = extractelement <8 x i16> %mul, i64 7 2530 2531 %acc = load i16, ptr addrspace(1) %dst, align 4 2532 %add1 = add i16 %mul0, %acc 2533 %add2 = add i16 %add1, %mul1 2534 %add3 = add i16 %add2, %mul2 2535 %add4 = add i16 %add3, %mul3 2536 %add5 = add i16 %add4, %mul4 2537 %add6 = add i16 %add5, %mul5 2538 %add7 = add i16 %add6, %mul6 2539 %add8 = add i16 %add7, %mul7 2540 2541 store i16 %add8, ptr addrspace(1) %dst, align 4 2542 ret void 2543} 2544 2545; TODO: Support this pattern. 
2546define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, 2547; GFX7-LABEL: idot8_acc8_vecMul: 2548; GFX7: ; %bb.0: ; %entry 2549; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2550; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2551; GFX7-NEXT: s_mov_b32 s14, -1 2552; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 2553; GFX7-NEXT: s_add_u32 s12, s12, s11 2554; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2555; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2556; GFX7-NEXT: s_mov_b32 s3, 0xf000 2557; GFX7-NEXT: s_mov_b32 s6, 0 2558; GFX7-NEXT: s_mov_b32 s7, s3 2559; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2560; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2561; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2562; GFX7-NEXT: v_mov_b32_e32 v1, 0 2563; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2564; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2565; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2566; GFX7-NEXT: s_mov_b32 s2, -1 2567; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 2568; GFX7-NEXT: s_addc_u32 s13, s13, 0 2569; GFX7-NEXT: s_waitcnt vmcnt(2) 2570; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4 2571; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 2572; GFX7-NEXT: s_waitcnt vmcnt(1) 2573; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 2574; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 2575; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 2576; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 2577; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 2578; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 2579; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 2580; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 2581; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 2582; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 2583; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 2584; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4 2585; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 2586; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 2587; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 2588; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 2589; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 2590; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 2591; GFX7-NEXT: s_waitcnt vmcnt(0) 2592; 
GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 2593; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 2594; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 2595; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 2596; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 2597; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 2598; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 2599; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 2600; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 2601; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 2602; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 2603; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 2604; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 2605; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 2606; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 2607; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 2608; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 2609; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 2610; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 2611; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 2612; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2613; GFX7-NEXT: s_endpgm 2614; 2615; GFX8-LABEL: idot8_acc8_vecMul: 2616; GFX8: ; %bb.0: ; %entry 2617; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2618; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2619; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2620; GFX8-NEXT: v_mov_b32_e32 v5, 12 2621; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2622; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2623; GFX8-NEXT: v_mov_b32_e32 v1, s1 2624; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2625; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2626; GFX8-NEXT: flat_load_dword v3, v[0:1] 2627; GFX8-NEXT: v_mov_b32_e32 v1, s3 2628; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2629; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2630; GFX8-NEXT: flat_load_dword v2, v[0:1] 2631; GFX8-NEXT: v_mov_b32_e32 v0, s4 2632; GFX8-NEXT: v_mov_b32_e32 v1, s5 2633; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 2634; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2635; GFX8-NEXT: s_mov_b32 s14, -1 2636; GFX8-NEXT: s_mov_b32 s15, 0xe80000 2637; GFX8-NEXT: s_add_u32 s12, s12, s11 2638; 
GFX8-NEXT: s_addc_u32 s13, s13, 0 2639; GFX8-NEXT: s_waitcnt vmcnt(2) 2640; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 2641; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 2642; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 2643; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 2644; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 2645; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 2646; GFX8-NEXT: s_waitcnt vmcnt(1) 2647; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 2648; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 2649; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 2650; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 2651; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 2652; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2653; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2654; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2 2655; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2656; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2657; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10 2658; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16 2659; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 2660; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 2661; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3 2662; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6 2663; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15 2664; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18 2665; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 2666; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2667; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2 2668; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11 2669; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 2670; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2671; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 2672; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2673; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2674; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 2675; GFX8-NEXT: 
v_ashrrev_i16_e32 v12, 12, v12 2676; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2677; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 2678; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2679; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 2680; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 2681; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 2682; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18 2683; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2684; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2685; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2686; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2687; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 2688; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2689; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 2690; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2691; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2692; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2693; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2694; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 2695; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2696; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 2697; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 2698; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2699; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 2700; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 2701; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] 2702; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 2703; GFX8-NEXT: s_waitcnt vmcnt(0) 2704; GFX8-NEXT: v_add_u16_e32 v3, v8, v4 2705; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 2706; 
GFX8-NEXT: v_add_u16_e32 v3, v3, v7 2707; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 2708; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 2709; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 2710; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 2711; GFX8-NEXT: v_add_u16_e32 v2, v2, v10 2712; GFX8-NEXT: flat_store_byte v[0:1], v2 2713; GFX8-NEXT: s_endpgm 2714; 2715; GFX9-LABEL: idot8_acc8_vecMul: 2716; GFX9: ; %bb.0: ; %entry 2717; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2718; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2719; GFX9-NEXT: s_mov_b32 s14, -1 2720; GFX9-NEXT: s_mov_b32 s15, 0xe00000 2721; GFX9-NEXT: s_add_u32 s12, s12, s11 2722; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2723; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2724; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2725; GFX9-NEXT: v_mov_b32_e32 v4, 12 2726; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2727; GFX9-NEXT: global_load_dword v1, v0, s[8:9] 2728; GFX9-NEXT: global_load_dword v2, v0, s[10:11] 2729; GFX9-NEXT: v_mov_b32_e32 v0, 0 2730; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] 2731; GFX9-NEXT: s_addc_u32 s13, s13, 0 2732; GFX9-NEXT: s_waitcnt vmcnt(2) 2733; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 2734; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 2735; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 2736; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 2737; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 2738; GFX9-NEXT: s_waitcnt vmcnt(1) 2739; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2 2740; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 2741; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 2742; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 2743; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 2744; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 2745; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2746; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2747; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 2748; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2749; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2750; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v9 2751; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 2752; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2753; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 2754; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 2755; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 2756; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1 2757; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5 2758; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14 2759; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17 2760; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2761; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2762; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 2763; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 2764; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v2 2765; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v10 2766; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2767; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2768; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2769; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 2770; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 2771; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2772; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2773; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2774; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2775; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2776; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 2777; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 2778; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2779; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2780; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2781; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 2782; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2783; GFX9-NEXT: 
v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2784; GFX9-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2785; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2786; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 2787; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 2788; GFX9-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2789; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 2790; GFX9-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2791; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 2792; GFX9-NEXT: v_or_b32_e32 v4, v4, v1 2793; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] 2794; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v4 2795; GFX9-NEXT: s_waitcnt vmcnt(0) 2796; GFX9-NEXT: v_add_u16_e32 v3, v7, v3 2797; GFX9-NEXT: v_add_u16_e32 v2, v3, v2 2798; GFX9-NEXT: v_add_u16_e32 v2, v2, v6 2799; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 2800; GFX9-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 2801; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 2802; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 2803; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 2804; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 2805; GFX9-NEXT: s_endpgm 2806; 2807; GFX9-DL-LABEL: idot8_acc8_vecMul: 2808; GFX9-DL: ; %bb.0: ; %entry 2809; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2810; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2811; GFX9-DL-NEXT: s_mov_b32 s14, -1 2812; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 2813; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 2814; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2815; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2816; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2817; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 2818; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2819; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 2820; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 2821; 
GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2822; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] 2823; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 2824; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 2825; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 2826; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 2827; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 2828; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 2829; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 2830; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2831; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 2832; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 2833; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 2834; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 2835; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 2836; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 2837; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2838; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2839; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 2840; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2841; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2842; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v9 2843; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 2844; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 2845; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 2846; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 2847; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 2848; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1 2849; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5 2850; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14 2851; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17 2852; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 2853; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2854; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 2855; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 2856; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v18, 12, v2 2857; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v10 2858; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 2859; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 2860; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 2861; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 2862; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 2863; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 2864; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 2865; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 2866; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 2867; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 2868; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 2869; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 2870; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2871; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2872; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2873; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 2874; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2875; GFX9-DL-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2876; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2877; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2878; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 2879; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 2880; GFX9-DL-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2881; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 16, v6 2882; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2883; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 2884; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v1 2885; 
GFX9-DL-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] 2886; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 2887; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2888; GFX9-DL-NEXT: v_add_u16_e32 v3, v7, v3 2889; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2 2890; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v6 2891; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 2892; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 2893; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 2894; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 2895; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 2896; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 2897; GFX9-DL-NEXT: s_endpgm 2898; 2899; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: 2900; GFX10-DL-XNACK: ; %bb.0: ; %entry 2901; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2902; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2903; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 2904; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 2905; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 2906; GFX10-DL-XNACK-NEXT: s_clause 0x1 2907; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2908; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2909; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2910; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 2911; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 2912; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) 2913; GFX10-DL-XNACK-NEXT: s_clause 0x1 2914; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] 2915; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] 2916; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1] 2917; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) 2918; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 2919; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) 2920; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 2921; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 2922; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2 2923; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 2924; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 2925; GFX10-DL-XNACK-NEXT: 
v_lshlrev_b16 v15, 12, v15 2926; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2 2927; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 2928; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 2929; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 2930; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 2931; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1 2932; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 2933; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 2934; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 2935; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 2936; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 2937; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 2938; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 2939; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 2940; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 2941; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 2942; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2 2943; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 2944; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 2945; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 2946; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 2947; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 2948; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 2949; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 2950; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 2951; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17 2952; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 2953; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 2954; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 2955; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 2956; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 2957; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 2958; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 2959; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 2960; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 2961; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 2962; GFX10-DL-XNACK-NEXT: 
v_ashrrev_i16 v13, 12, v13 2963; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 2964; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 2965; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2966; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 2967; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 2968; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 2969; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 2970; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 2971; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11 2972; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 2973; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10 2974; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 2975; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12 2976; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 2977; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2 2978; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 2979; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2980; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2981; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2982; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2983; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 2984; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) 2985; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3 2986; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2987; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10 2988; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] 2989; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 2990; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8 2991; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2 2992; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, 
v5, v12, v0 2993; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 2994; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 2995; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0 2996; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 2997; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1] 2998; GFX10-DL-XNACK-NEXT: s_endpgm 2999; 3000; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: 3001; GFX10-DL-NOXNACK: ; %bb.0: ; %entry 3002; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 3003; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 3004; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 3005; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 3006; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 3007; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 3008; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 3009; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 3010; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3011; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 3012; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 3013; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) 3014; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 3015; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] 3016; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] 3017; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1] 3018; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) 3019; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 3020; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) 3021; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 3022; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 3023; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 3024; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 3025; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 3026; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 3027; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 3028; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 3029; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 3030; 
GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 3031; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 3032; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 3033; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 3034; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 3035; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 3036; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 3037; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 3038; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 3039; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 3040; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 3041; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 3042; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 3043; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 3044; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 3045; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 3046; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 3047; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 3048; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 3049; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 3050; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 3051; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 3052; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 3053; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16 3054; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 3055; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 3056; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 3057; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 3058; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 3059; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 3060; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 3061; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 3062; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 3063; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 3064; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 3065; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 
3066; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 3067; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 3068; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 3069; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 3070; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 3071; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11 3072; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 3073; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0 3074; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10 3075; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 3076; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12 3077; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14 3078; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3 3079; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 3080; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 3081; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 3082; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 3083; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 3084; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 3085; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) 3086; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2 3087; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 3088; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10 3089; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] 3090; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 3091; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8 3092; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2 3093; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0 3094; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 3095; 
GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 3096; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0 3097; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 3098; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1] 3099; GFX10-DL-NOXNACK-NEXT: s_endpgm 3100 ptr addrspace(1) %src2, 3101 ptr addrspace(1) nocapture %dst) { 3102entry: 3103 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3104 %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx 3105 %vec1 = load <8 x i4>, ptr addrspace(1) %gep1 3106 %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx 3107 %vec2 = load <8 x i4>, ptr addrspace(1) %gep2 3108 3109 %cvec1 = sext <8 x i4> %vec1 to <8 x i8> 3110 %cvec2 = sext <8 x i4> %vec2 to <8 x i8> 3111 3112 %mul = mul <8 x i8> %cvec1, %cvec2 3113 %mul0 = extractelement <8 x i8> %mul, i64 0 3114 %mul1 = extractelement <8 x i8> %mul, i64 1 3115 %mul2 = extractelement <8 x i8> %mul, i64 2 3116 %mul3 = extractelement <8 x i8> %mul, i64 3 3117 %mul4 = extractelement <8 x i8> %mul, i64 4 3118 %mul5 = extractelement <8 x i8> %mul, i64 5 3119 %mul6 = extractelement <8 x i8> %mul, i64 6 3120 %mul7 = extractelement <8 x i8> %mul, i64 7 3121 3122 %acc = load i8, ptr addrspace(1) %dst, align 4 3123 %add1 = add i8 %mul0, %acc 3124 %add2 = add i8 %add1, %mul1 3125 %add3 = add i8 %add2, %mul2 3126 %add4 = add i8 %add3, %mul3 3127 %add5 = add i8 %add4, %mul4 3128 %add6 = add i8 %add5, %mul5 3129 %add7 = add i8 %add6, %mul6 3130 %add8 = add i8 %add7, %mul7 3131 3132 store i8 %add8, ptr addrspace(1) %dst, align 4 3133 ret void 3134} 3135 3136declare i32 @llvm.amdgcn.workitem.id.x() 3137