; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avx512bw,+avxvnni | FileCheck %s --check-prefix=AVX

; These tests check that (add (pmaddwd x, y), acc) — in either operand order,
; and with either multiplicand coming from memory — is folded into a single
; VNNI dot-product instruction: the VEX-encoded {vex} vpdpwssd with +avxvnni,
; or the EVEX vpdpwssd with +avx512vnni,+avx512vl.

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %1, %a0
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_commute:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %a0, %1
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load1(<4 x i32> %a0, ptr %p1, <8 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_load1:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_load1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; AVX512-NEXT:    retq
  %a1 = load <8 x i16>, ptr %p1
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %1, %a0
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load2(<4 x i32> %a0, <8 x i16> %a1, ptr %p2) {
; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_load2:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_load2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; AVX512-NEXT:    retq
  %a2 = load <8 x i16>, ptr %p2
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %1, %a0
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load1(<4 x i32> %a0, ptr %p1, <8 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load1:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; AVX512-NEXT:    retq
  %a1 = load <8 x i16>, ptr %p1
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %a0, %1
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load2(<4 x i32> %a0, <8 x i16> %a1, ptr %p2) {
; AVX-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load2:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; AVX512-NEXT:    retq
  %a2 = load <8 x i16>, ptr %p2
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %a0, %1
  ret <4 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %1, %a0
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_commute:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %a0, %1
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load1(<8 x i32> %a0, ptr %p1, <16 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_load1:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_load1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a1 = load <16 x i16>, ptr %p1
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %1, %a0
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load2(<8 x i32> %a0, <16 x i16> %a1, ptr %p2) {
; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_load2:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_load2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a2 = load <16 x i16>, ptr %p2
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %1, %a0
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load1(<8 x i32> %a0, ptr %p1, <16 x i16> %a2) {
; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load1:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a1 = load <16 x i16>, ptr %p1
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %a0, %1
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load2(<8 x i32> %a0, <16 x i16> %a1, ptr %p2) {
; AVX-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load2:
; AVX:       # %bb.0:
; AVX-NEXT:    {vex} vpdpwssd (%rdi), %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a2 = load <16 x i16>, ptr %p2
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %a0, %1
  ret <8 x i32> %2
}

declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)