; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s

declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
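
; By clobbering xmm3-xmm15 and the flags with an inline-asm nop, each test
; forces %a2 to be spilled across the asm; the checks then verify that the
; reload is folded back into the AVX-VNNI-INT8 instruction as a memory
; operand ("Folded Reload") wherever the instruction permits it.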

define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}
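
; vpdpbsud/vpdpbsuds multiply signed bytes by unsigned bytes, so the two
; multiplicand operands are not commutable: only the unsigned (last) operand
; can be folded from memory. In the commuted tests below the spilled value
; therefore has to be reloaded into a register instead of being folded.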

define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT:    vpdpbsud %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT:    vpdpbsud %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT:    vpdpbsuds %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT:    vpdpbsuds %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}
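
; vpdpbuud multiplies unsigned bytes by unsigned bytes, so, as with the
; signed/signed forms above, the multiplicands are commutable and the
; reload folds in either operand order.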

define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}
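
; vpdpbuuds is the saturating form of vpdpbuud; folding behaves the same.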

define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x i32> %2
}

define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
  ret <4 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x i32> %2
}

define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
  ret <8 x i32> %2
}