; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s

declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)

define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
  ret <4 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
  ret <8 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
  ret <4 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
  ret <8 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x71,0xd2,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
  ret <4 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x75,0xd2,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
  ret <8 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x71,0xd3,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
  ret <4 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x75,0xd3,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
  ret <8 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
  ret <4 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwuud_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B)
  ret <4 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
  ret <8 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwuud_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B)
  ret <8 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
  ret <4 x i32> %ret
}

define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B)
  ret <4 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
  ret <8 x i32> %ret
}

define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop # encoding: [0x90]
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8]
; CHECK-NEXT:    retq # encoding: [0xc3]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B)
  ret <8 x i32> %ret
}