; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI

define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: no_dpbusd:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: no_dpbusd:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

define i32 @vpdpbusd_mutate(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_mutate:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rdi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_mutate:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_mutate:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rdi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = sext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_zext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_zext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i16>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i16>
  %4 = mul nsw <16 x i16> %3, %1
  ; We can't combine this into vpdpbusd when the product is zero-extended,
  ; because each of the 4 multiplies done by vpdpbusd computes a signed 16-bit
  ; product that is sign-extended before being added into the accumulator.
  ; For example, for the bytes 255 (unsigned) and -1 (signed), vpdpbusd would
  ; accumulate -255, whereas this code zero-extends the i16 product (0xFF01)
  ; and accumulates 65281.
  %5 = zext <16 x i16> %4 to <16 x i32>
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %op.extra = add nsw i32 %6, %c
  ret i32 %op.extra
}

define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_sext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_sext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i16>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i16>
  %4 = mul nsw <16 x i16> %3, %1
  ; TODO:
  ; We also need to verify that the multiply has at least 2x the number of bits
  ; of the input. We shouldn't match
  ; (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
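  ; A worked example of that concern (editorial note, not autogenerated output):
  ; if the multiply were performed in vXi9 with X = Y = 255, the product
  ; 255 * 255 = 65025 wraps to 1 in 9 bits, so the sign_extend yields 1,
  ; whereas a vpdpbusd lowering keeps full 16-bit intermediate products and
  ; would accumulate a different value, making the fold incorrect for such
  ; narrow multiplies.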
  %5 = sext <16 x i16> %4 to <16 x i32>
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %op.extra = add nsw i32 %6, %c
  ret i32 %op.extra
}

define i32 @vpdpbusd_512(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_512:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_512:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_512:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <16 x i8>, ptr %a, align 16
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 16
  %3 = sext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @vpdpbusd_256(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_256:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm0, %xmm1, %xmm2
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_256:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_256:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <8 x i8>, ptr %a, align 8
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b, align 8
  %3 = sext <8 x i8> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_128:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_128:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_128:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <4 x i8>, ptr %a, align 8
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %b, align 8
  %3 = sext <4 x i8> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_2xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm2, %zmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_2xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <2 x i8>, ptr %a, align 8
  %1 = zext <2 x i8> %0 to <2 x i32>
  %2 = load <2 x i8>, ptr %b, align 8
  %3 = sext <2 x i8> %2 to <2 x i32>
  %4 = mul nsw <2 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)

define i32 @vpdpbusd_32xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_32xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm1
; AVXVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_32xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VNNI-NEXT:    vmovdqu (%rsi), %ymm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vextracti128 $1, %ymm2, %xmm0
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_32xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %ymm0, %ymm1
; AVX512VLVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    vzeroupper
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = load <32 x i8>, ptr %a, align 16
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = load <32 x i8>, ptr %b, align 16
  %3 = sext <32 x i8> %2 to <32 x i32>
  %4 = mul nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

define i32 @vpdpbusd_64xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_64xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVXVNNI-NEXT:    {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm2
; AVXVNNI-NEXT:    vpaddd %ymm3, %ymm2, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: vpdpbusd_64xi32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpdpbusd (%rsi), %zmm0, %zmm1
; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = load <64 x i8>, ptr %a, align 16
  %1 = zext <64 x i8> %0 to <64 x i32>
  %2 = load <64 x i8>, ptr %b, align 16
  %3 = sext <64 x i8> %2 to <64 x i32>
  %4 = mul nsw <64 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
  %op.extra = add nsw i32 %5, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)