xref: /llvm-project/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (revision 1de3dc7d23dd6b856efad3a3a04f2396328726d7)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
3; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
4; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
5
6target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
7target triple = "aarch64-none-unknown-elf"
8
; @dotp: scalar dot-product loop used as input for the loop vectorizer.
; For 1024 iterations it loads one i8 from %a and one i8 from %b, zero-extends
; both to i32, multiplies them, and adds the product to an i32 accumulator
; whose final value is returned.
; The autogenerated assertions inside the function give the expected output
; for each of the three opt invocations in the run lines at the top of the file.
; (Attribute group #0 is defined outside this view.)
9define i32 @dotp(ptr %a, ptr %b) #0 {
; Expected output with -force-vector-interleave=1 (first run line): a single
; <vscale x 4 x i32> accumulator updated by a plain mul + add in vector.body
; and reduced with llvm.vector.reduce.add in middle.block.
; NOTE(review): several capture names in this group do not match the labels
; they stand for (e.g. VEC_EPILOG_PH is the block printed as scalar.ph, and
; VECTOR_BODY is the block printed as for.body); presumably stale names kept
; by the update script -- confirm by regenerating the assertions.
10; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
11; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
12; CHECK-INTERLEAVE1-NEXT:  entry:
13; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
14; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
15; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
16; CHECK-INTERLEAVE1:       vector.ph:
17; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
18; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
19; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
20; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
21; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
22; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
23; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY1:%.*]]
24; CHECK-INTERLEAVE1:       vector.body:
25; CHECK-INTERLEAVE1-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
26; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ]
27; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX1]], 0
28; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
29; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
30; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
31; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
32; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
33; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
34; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
35; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
36; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
37; CHECK-INTERLEAVE1-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
38; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
39; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
40; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
41; CHECK-INTERLEAVE1:       middle.block:
42; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
43; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
44; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
45; CHECK-INTERLEAVE1:       scalar.ph:
46; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
47; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
48; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
49; CHECK-INTERLEAVE1:       for.body:
50; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
51; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
52; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
53; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
54; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
55; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
56; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
57; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
58; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
59; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
60; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
61; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
62; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
63; CHECK-INTERLEAVE1:       for.exit:
64; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ]
65; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
66;
; Expected output when no interleave count is forced (second run line): two
; <vscale x 4 x i32> accumulators, each fed by its own pair of
; <vscale x 4 x i8> loads; they are added together in middle.block before the
; final llvm.vector.reduce.add.
67; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
68; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
69; CHECK-INTERLEAVED-NEXT:  entry:
70; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
71; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
72; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
73; CHECK-INTERLEAVED:       vector.ph:
74; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
75; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
76; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
77; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
78; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
79; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
80; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY1:%.*]]
81; CHECK-INTERLEAVED:       vector.body:
82; CHECK-INTERLEAVED-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
83; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ]
84; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ]
85; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX1]], 0
86; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
87; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
88; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
89; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP14]], 4
90; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
91; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
92; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
93; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
94; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
95; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
96; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
97; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
98; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 4
99; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
100; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
101; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
102; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
103; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
104; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
105; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
106; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
107; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
108; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
109; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
110; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
111; CHECK-INTERLEAVED:       middle.block:
112; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
113; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
114; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
115; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
116; CHECK-INTERLEAVED:       scalar.ph:
117; CHECK-INTERLEAVED-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
118; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
119; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
120; CHECK-INTERLEAVED:       for.body:
121; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
122; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
123; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
124; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
125; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
126; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
127; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
128; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
129; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
130; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
131; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
132; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
133; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
134; CHECK-INTERLEAVED:       for.exit:
135; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ]
136; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
137;
; Expected output with -vectorizer-maximize-bandwidth and interleave forced to
; 1 (third run line): <vscale x 8 x i8> loads are zero-extended to
; <vscale x 8 x i32>, multiplied, and folded into a <vscale x 2 x i32>
; accumulator by llvm.experimental.vector.partial.reduce.add.
138; CHECK-MAXBW-LABEL: define i32 @dotp(
139; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
140; CHECK-MAXBW-NEXT:  entry:
141; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
142; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
143; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
144; CHECK-MAXBW:       vector.ph:
145; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
146; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
147; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
148; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
149; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
150; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
151; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
152; CHECK-MAXBW:       vector.body:
153; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
154; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
155; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
156; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
157; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
158; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
159; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
160; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
161; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
162; CHECK-MAXBW-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
163; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
164; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
165; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
166; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
167; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
168; CHECK-MAXBW-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
169; CHECK-MAXBW:       middle.block:
170; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]])
171; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
172; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
173; CHECK-MAXBW:       scalar.ph:
174; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
175; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
176; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
177; CHECK-MAXBW:       for.body:
178; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
179; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
180; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
181; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
182; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
183; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
184; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
185; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
186; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
187; CHECK-MAXBW-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
188; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
189; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
190; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
191; CHECK-MAXBW:       for.exit:
192; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
193; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
194;
; Input IR: the scalar loop handed to the loop vectorizer.
195entry:
196  br label %for.body
197
198for.body:                                         ; preds = %for.body, %entry
; %iv is the induction variable (0 on entry, +1 per iteration); %accum carries
; the running i32 sum (0 on entry).
199  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
200  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
; Load a[iv] and b[iv] as i8 and zero-extend each to i32.
201  %gep.a = getelementptr i8, ptr %a, i64 %iv
202  %load.a = load i8, ptr %gep.a, align 1
203  %ext.a = zext i8 %load.a to i32
204  %gep.b = getelementptr i8, ptr %b, i64 %iv
205  %load.b = load i8, ptr %gep.b, align 1
206  %ext.b = zext i8 %load.b to i32
; Multiply the two extended values and add the product to the running sum.
207  %mul = mul i32 %ext.b, %ext.a
208  %add = add i32 %mul, %accum
; Advance the induction variable and leave the loop once it reaches 1024.
209  %iv.next = add i64 %iv, 1
210  %exitcond.not = icmp eq i64 %iv.next, 1024
211  br i1 %exitcond.not, label %for.exit, label %for.body
212
213for.exit:                        ; preds = %for.body
; Return the accumulated sum from the final iteration.
214  ret i32 %add
215}
216
217define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
218; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
219; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
220; CHECK-INTERLEAVE1-NEXT:  entry:
221; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
222; CHECK-INTERLEAVE1:       vector.ph:
223; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
224; CHECK-INTERLEAVE1:       vector.body:
225; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
226; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
227; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
228; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
229; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
230; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
231; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
232; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
233; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
234; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
235; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
236; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
237; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
238; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
239; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
240; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
241; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
242; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
243; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
244; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
245; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
246; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
247; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
248; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
249; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
250; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
251; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
252; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
253; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
254; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
255; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
256; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
257; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
258; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
259; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
260; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
261; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
262; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
263; CHECK-INTERLEAVE1-NEXT:    [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2
264; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2
265; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2
266; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2
267; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2
268; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2
269; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2
270; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2
271; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2
272; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2
273; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2
274; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = load i16, ptr [[TMP30]], align 2
275; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2
276; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2
277; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2
278; CHECK-INTERLEAVE1-NEXT:    [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2
279; CHECK-INTERLEAVE1-NEXT:    [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0
280; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1
281; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2
282; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3
283; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4
284; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5
285; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6
286; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7
287; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8
288; CHECK-INTERLEAVE1-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9
289; CHECK-INTERLEAVE1-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10
290; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11
291; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12
292; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13
293; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14
294; CHECK-INTERLEAVE1-NEXT:    [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15
295; CHECK-INTERLEAVE1-NEXT:    [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32>
296; CHECK-INTERLEAVE1-NEXT:    [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]]
297; CHECK-INTERLEAVE1-NEXT:    [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]]
298; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
299; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
300; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
301; CHECK-INTERLEAVE1:       middle.block:
302; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
303; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
304; CHECK-INTERLEAVE1:       scalar.ph:
305; CHECK-INTERLEAVE1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ]
306; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
307; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
308; CHECK-INTERLEAVE1:       for.body:
309; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
310; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
311; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
312; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
313; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
314; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
315; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
316; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
317; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
318; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
319; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
320; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
321; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
322; CHECK-INTERLEAVE1:       for.exit:
323; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ]
324; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
325;
326; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
327; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
328; CHECK-INTERLEAVED-NEXT:  entry:
329; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
330; CHECK-INTERLEAVED:       vector.ph:
331; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
332; CHECK-INTERLEAVED:       vector.body:
333; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
334; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ]
335; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
336; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
337; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
338; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
339; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
340; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
341; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
342; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
343; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
344; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
345; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
346; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
347; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
348; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
349; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
350; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
351; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
352; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
353; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 17
354; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 18
355; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 19
356; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 20
357; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 21
358; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 22
359; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 23
360; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = add i64 [[INDEX]], 24
361; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 25
362; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX]], 26
363; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 27
364; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = add i64 [[INDEX]], 28
365; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 29
366; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], 30
367; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 31
368; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
369; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
370; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
371; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
372; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
373; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
374; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
375; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
376; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
377; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
378; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
379; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
380; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
381; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
382; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
383; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
384; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
385; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
386; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
387; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
388; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
389; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
390; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
391; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]]
392; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
393; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]]
394; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
395; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]]
396; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]]
397; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]]
398; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
399; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]]
400; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]]
401; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]]
402; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]]
403; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]]
404; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]]
405; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]]
406; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]]
407; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2
408; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = load i16, ptr [[TMP40]], align 2
409; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2
410; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2
411; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2
412; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2
413; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2
414; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2
415; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2
416; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2
417; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2
418; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2
419; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2
420; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2
421; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2
422; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2
423; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0
424; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1
425; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2
426; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3
427; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4
428; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5
429; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6
430; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7
431; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8
432; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9
433; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10
434; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11
435; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12
436; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13
437; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14
438; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15
439; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2
440; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2
441; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2
442; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2
443; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2
444; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2
445; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2
446; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2
447; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2
448; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2
449; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2
450; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2
451; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2
452; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2
453; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2
454; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2
455; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
456; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
457; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
458; CHECK-INTERLEAVED-NEXT:    [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
459; CHECK-INTERLEAVED-NEXT:    [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
460; CHECK-INTERLEAVED-NEXT:    [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
461; CHECK-INTERLEAVED-NEXT:    [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
462; CHECK-INTERLEAVED-NEXT:    [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
463; CHECK-INTERLEAVED-NEXT:    [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
464; CHECK-INTERLEAVED-NEXT:    [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
465; CHECK-INTERLEAVED-NEXT:    [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
466; CHECK-INTERLEAVED-NEXT:    [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
467; CHECK-INTERLEAVED-NEXT:    [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
468; CHECK-INTERLEAVED-NEXT:    [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
469; CHECK-INTERLEAVED-NEXT:    [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
470; CHECK-INTERLEAVED-NEXT:    [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
471; CHECK-INTERLEAVED-NEXT:    [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32>
472; CHECK-INTERLEAVED-NEXT:    [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
473; CHECK-INTERLEAVED-NEXT:    [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]]
474; CHECK-INTERLEAVED-NEXT:    [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
475; CHECK-INTERLEAVED-NEXT:    [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]]
476; CHECK-INTERLEAVED-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
477; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
478; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
479; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
480; CHECK-INTERLEAVED:       middle.block:
481; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
482; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
483; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
484; CHECK-INTERLEAVED:       scalar.ph:
485; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
486; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP142]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
487; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
488; CHECK-INTERLEAVED:       for.body:
489; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
490; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
491; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
492; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
493; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
494; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
495; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
496; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
497; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
498; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
499; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
500; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
501; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
502; CHECK-INTERLEAVED:       for.exit:
503; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP142]], [[MIDDLE_BLOCK]] ]
504; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
505;
506; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
507; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
508; CHECK-MAXBW-NEXT:  entry:
509; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
510; CHECK-MAXBW:       vector.ph:
511; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
512; CHECK-MAXBW:       vector.body:
513; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
514; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ]
515; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
516; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
517; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
518; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
519; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
520; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
521; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
522; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
523; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
524; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
525; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
526; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
527; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
528; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
529; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
530; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
531; CHECK-MAXBW-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
532; CHECK-MAXBW-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
533; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
534; CHECK-MAXBW-NEXT:    [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
535; CHECK-MAXBW-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
536; CHECK-MAXBW-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
537; CHECK-MAXBW-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
538; CHECK-MAXBW-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
539; CHECK-MAXBW-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
540; CHECK-MAXBW-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]]
541; CHECK-MAXBW-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
542; CHECK-MAXBW-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]]
543; CHECK-MAXBW-NEXT:    [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
544; CHECK-MAXBW-NEXT:    [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
545; CHECK-MAXBW-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
546; CHECK-MAXBW-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
547; CHECK-MAXBW-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
548; CHECK-MAXBW-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]]
549; CHECK-MAXBW-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]]
550; CHECK-MAXBW-NEXT:    [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]]
551; CHECK-MAXBW-NEXT:    [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2
552; CHECK-MAXBW-NEXT:    [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2
553; CHECK-MAXBW-NEXT:    [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2
554; CHECK-MAXBW-NEXT:    [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2
555; CHECK-MAXBW-NEXT:    [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2
556; CHECK-MAXBW-NEXT:    [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2
557; CHECK-MAXBW-NEXT:    [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2
558; CHECK-MAXBW-NEXT:    [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2
559; CHECK-MAXBW-NEXT:    [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2
560; CHECK-MAXBW-NEXT:    [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2
561; CHECK-MAXBW-NEXT:    [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2
562; CHECK-MAXBW-NEXT:    [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2
563; CHECK-MAXBW-NEXT:    [[TMP113:%.*]] = load i16, ptr [[TMP49]], align 2
564; CHECK-MAXBW-NEXT:    [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2
565; CHECK-MAXBW-NEXT:    [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2
566; CHECK-MAXBW-NEXT:    [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2
567; CHECK-MAXBW-NEXT:    [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0
568; CHECK-MAXBW-NEXT:    [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1
569; CHECK-MAXBW-NEXT:    [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2
570; CHECK-MAXBW-NEXT:    [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3
571; CHECK-MAXBW-NEXT:    [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4
572; CHECK-MAXBW-NEXT:    [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5
573; CHECK-MAXBW-NEXT:    [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6
574; CHECK-MAXBW-NEXT:    [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7
575; CHECK-MAXBW-NEXT:    [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8
576; CHECK-MAXBW-NEXT:    [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9
577; CHECK-MAXBW-NEXT:    [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10
578; CHECK-MAXBW-NEXT:    [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11
579; CHECK-MAXBW-NEXT:    [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12
580; CHECK-MAXBW-NEXT:    [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13
581; CHECK-MAXBW-NEXT:    [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14
582; CHECK-MAXBW-NEXT:    [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15
583; CHECK-MAXBW-NEXT:    [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32>
584; CHECK-MAXBW-NEXT:    [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]]
585; CHECK-MAXBW-NEXT:    [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]]
586; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
587; CHECK-MAXBW-NEXT:    [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
588; CHECK-MAXBW-NEXT:    br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
589; CHECK-MAXBW:       middle.block:
590; CHECK-MAXBW-NEXT:    [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]])
591; CHECK-MAXBW-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
592; CHECK-MAXBW:       scalar.ph:
593; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
594; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
595; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
596; CHECK-MAXBW:       for.body:
597; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
598; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
599; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
600; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
601; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
602; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
603; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2
604; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32
605; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
606; CHECK-MAXBW-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
607; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
608; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
609; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
610; CHECK-MAXBW:       for.exit:
611; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ]
612; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
613;
614entry:
615  br label %for.body
616
617for.body:                                         ; preds = %for.body, %entry
618  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
619  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
620  %gep.a = getelementptr i8, ptr %a, i64 %iv
621  %load.a = load i8, ptr %gep.a, align 1
622  %ext.a = zext i8 %load.a to i32
623  %gep.b = getelementptr i8, ptr %b, i64 %iv
624  %load.b = load i16, ptr %gep.b, align 2
625  %ext.b = zext i16 %load.b to i32
626  %mul = mul i32 %ext.b, %ext.a
627  %add = add i32 %mul, %accum
628  %iv.next = add i64 %iv, 1
629  %exitcond.not = icmp eq i64 %iv.next, 1024
630  br i1 %exitcond.not, label %for.exit, label %for.body
631
632for.exit:                        ; preds = %for.body
633  ret i32 %add
634}
635
636define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
637; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
638; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
639; CHECK-INTERLEAVE1-NEXT:  entry:
640; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
641; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
642; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
643; CHECK-INTERLEAVE1:       vector.ph:
644; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
645; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
646; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
647; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
648; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
649; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
650; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
651; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
652; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
653; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
654; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
655; CHECK-INTERLEAVE1:       vector.body:
656; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
657; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
658; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
659; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
660; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
661; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
662; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
663; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
664; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
665; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
666; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
667; CHECK-INTERLEAVE1-NEXT:    [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
668; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
669; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
670; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
671; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
672; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
673; CHECK-INTERLEAVE1:       middle.block:
674; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
675; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 8
676; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
677; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP22]]
678; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
679; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 8
680; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP25]], 1
681; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP26]]
682; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
683; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
684; CHECK-INTERLEAVE1:       scalar.ph:
685; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
686; CHECK-INTERLEAVE1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
687; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
688; CHECK-INTERLEAVE1:       for.body:
689; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
690; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
691; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
692; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
693; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
694; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
695; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
696; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
697; CHECK-INTERLEAVE1-NEXT:    [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]]
698; CHECK-INTERLEAVE1-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
699; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
700; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
701; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
702; CHECK-INTERLEAVE1:       for.exit:
703; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
704; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
705;
706; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
707; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
708; CHECK-INTERLEAVED-NEXT:  entry:
709; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
710; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
711; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
712; CHECK-INTERLEAVED:       vector.ph:
713; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
714; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
715; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
716; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
717; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
718; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
719; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
720; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
721; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
722; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
723; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
724; CHECK-INTERLEAVED:       vector.body:
725; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
726; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
727; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
728; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
729; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
730; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
731; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
732; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
733; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
734; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
735; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
736; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
737; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
738; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
739; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
740; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
741; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
742; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
743; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
744; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
745; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
746; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
747; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
748; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
749; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
750; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
751; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
752; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
753; CHECK-INTERLEAVED:       middle.block:
754; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
755; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 8
756; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
757; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]]
758; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
759; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP33]], 8
760; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], 1
761; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP35]]
762; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
763; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
764; CHECK-INTERLEAVED:       scalar.ph:
765; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
766; CHECK-INTERLEAVED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
767; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
768; CHECK-INTERLEAVED:       for.body:
769; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
770; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
771; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
772; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
773; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
774; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
775; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
776; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
777; CHECK-INTERLEAVED-NEXT:    [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]]
778; CHECK-INTERLEAVED-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
779; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
780; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
781; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
782; CHECK-INTERLEAVED:       for.exit:
783; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
784; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
785;
786; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
787; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
788; CHECK-MAXBW-NEXT:  entry:
789; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
790; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
791; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
792; CHECK-MAXBW:       vector.ph:
793; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
794; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
795; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
796; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
797; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
798; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
799; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
800; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
801; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
802; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
803; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
804; CHECK-MAXBW:       vector.body:
805; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
806; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
807; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
808; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
809; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
810; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
811; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
812; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
813; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
814; CHECK-MAXBW-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
815; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
816; CHECK-MAXBW-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
817; CHECK-MAXBW-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
818; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
819; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
820; CHECK-MAXBW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
821; CHECK-MAXBW-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
822; CHECK-MAXBW:       middle.block:
823; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
824; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 8
825; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
826; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP22]]
827; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
828; CHECK-MAXBW-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP24]], 8
829; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
830; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]]
831; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
832; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
833; CHECK-MAXBW:       scalar.ph:
834; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
835; CHECK-MAXBW-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
836; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
837; CHECK-MAXBW:       for.body:
838; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
839; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
840; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
841; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
842; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
843; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
844; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
845; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
846; CHECK-MAXBW-NEXT:    [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]]
847; CHECK-MAXBW-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
848; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
849; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
850; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
851; CHECK-MAXBW:       for.exit:
852; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
853; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
854;
855entry:
856  br label %for.body
857
858for.body:                                         ; preds = %for.body, %entry
859  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
860  %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
861  %gep.a = getelementptr i8, ptr %a, i64 %iv
862  %load.a = load i8, ptr %gep.a, align 1
863  %ext.a = zext i8 %load.a to i32
864  %gep.b = getelementptr i8, ptr %b, i64 %iv
865  %load.b = load i8, ptr %gep.b, align 1
866  %ext.b = zext i8 %load.b to i32
867  %mul = mul i32 %ext.b, %ext.a
868  %add = add i32 %mul, %accum
869  %iv.next = add i64 %iv, 1
870  %exitcond.not = icmp eq i64 %iv.next, 1024
871  br i1 %exitcond.not, label %for.exit, label %for.body
872
873for.exit:                        ; preds = %for.body
874  ret i32 %add
875}
876
; not_dotp_not_phi: a 1024-iteration loop that multiplies zero-extended i8
; loads from %a and %b and then adds %ext.b to the product. %accum is a phi
; fed by %add, but %add never reads %accum, so the add does not accumulate
; through the phi. The assertions below expect a plain widened mul/add on
; <vscale x 8 x i32>, with the phi widened as a recurrence phi (VECTOR_RECUR)
; and the last lane of the widened add extracted in middle.block.
; NOTE(review): presumably a negative test for partial-reduction matching,
; going by the function and file names -- confirm.
877define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
878; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
879; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
880; CHECK-INTERLEAVE1-NEXT:  entry:
881; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
882; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
883; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
884; CHECK-INTERLEAVE1:       vector.ph:
885; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
886; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
887; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
888; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
889; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
890; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
891; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
892; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
893; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
894; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
895; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
896; CHECK-INTERLEAVE1:       vector.body:
897; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
898; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
899; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
900; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
901; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
902; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
903; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
904; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
905; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
906; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
907; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
908; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
909; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
910; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
911; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
912; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
913; CHECK-INTERLEAVE1:       middle.block:
914; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
915; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], 8
916; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = sub i32 [[TMP20]], 1
917; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 8 x i32> [[TMP17]], i32 [[TMP21]]
918; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
919; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP23]], 8
920; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP24]], 1
921; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP17]], i32 [[TMP25]]
922; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
923; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
924; CHECK-INTERLEAVE1:       scalar.ph:
925; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
926; CHECK-INTERLEAVE1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
927; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
928; CHECK-INTERLEAVE1:       for.body:
929; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
930; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
931; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
932; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
933; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
934; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
935; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
936; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
937; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
938; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[EXT_B]]
939; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
940; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
941; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
942; CHECK-INTERLEAVE1:       for.exit:
943; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
944; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
945;
946; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
947; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
948; CHECK-INTERLEAVED-NEXT:  entry:
949; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
950; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
951; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
952; CHECK-INTERLEAVED:       vector.ph:
953; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
954; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
955; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
956; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
957; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
958; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
959; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
960; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
961; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
962; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
963; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
964; CHECK-INTERLEAVED:       vector.body:
965; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
966; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
967; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
968; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
969; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
970; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
971; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
972; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
973; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
974; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
975; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
976; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
977; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
978; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
979; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
980; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
981; CHECK-INTERLEAVED-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
982; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
983; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
984; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
985; CHECK-INTERLEAVED:       middle.block:
986; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
987; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP23]], 8
988; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP31]], 1
989; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP25]]
990; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
991; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], 8
992; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP28]], 1
993; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP29]]
994; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
995; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
996; CHECK-INTERLEAVED:       scalar.ph:
997; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
998; CHECK-INTERLEAVED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
999; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
1000; CHECK-INTERLEAVED:       for.body:
1001; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1002; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1003; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
1004; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1005; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
1006; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
1007; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
1008; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
1009; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
1010; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i32 [[MUL]], [[EXT_B]]
1011; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
1012; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
1013; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
1014; CHECK-INTERLEAVED:       for.exit:
1015; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
1016; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
1017;
1018; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
1019; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1020; CHECK-MAXBW-NEXT:  entry:
1021; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1022; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
1023; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1024; CHECK-MAXBW:       vector.ph:
1025; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1026; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
1027; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
1028; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
1029; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1030; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
1031; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
1032; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
1033; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
1034; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
1035; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
1036; CHECK-MAXBW:       vector.body:
1037; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1038; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
1039; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
1040; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
1041; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
1042; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
1043; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
1044; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
1045; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
1046; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
1047; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
1048; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
1049; CHECK-MAXBW-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
1050; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1051; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1052; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
1053; CHECK-MAXBW:       middle.block:
1054; CHECK-MAXBW-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
1055; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
1056; CHECK-MAXBW-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP27]], 1
1057; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP28]]
1058; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
1059; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP23]], 8
1060; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP24]], 1
1061; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP25]]
1062; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
1063; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
1064; CHECK-MAXBW:       scalar.ph:
1065; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1066; CHECK-MAXBW-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1067; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
1068; CHECK-MAXBW:       for.body:
1069; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1070; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1071; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
1072; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1073; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
1074; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
1075; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
1076; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
1077; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
1078; CHECK-MAXBW-NEXT:    [[ADD]] = add i32 [[MUL]], [[EXT_B]]
1079; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
1080; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
1081; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
1082; CHECK-MAXBW:       for.exit:
1083; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
1084; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
1085;
1086entry:
1087  br label %for.body
1088
1089for.body:                                         ; preds = %for.body, %entry
1090  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
; %accum carries the previous iteration's %add (0 on entry); no instruction
; in this function reads it.
1091  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
1092  %gep.a = getelementptr i8, ptr %a, i64 %iv
1093  %load.a = load i8, ptr %gep.a, align 1
1094  %ext.a = zext i8 %load.a to i32
1095  %gep.b = getelementptr i8, ptr %b, i64 %iv
1096  %load.b = load i8, ptr %gep.b, align 1
1097  %ext.b = zext i8 %load.b to i32
1098  %mul = mul i32 %ext.b, %ext.a
; The second addend is %ext.b rather than %accum, so this add is not chained
; through the phi above.
1099  %add = add i32 %mul, %ext.b
1100  %iv.next = add i64 %iv, 1
; Fixed trip count: the loop exits once %iv.next reaches 1024.
1101  %exitcond.not = icmp eq i64 %iv.next, 1024
1102  br i1 %exitcond.not, label %for.exit, label %for.body
1103
1104for.exit:                        ; preds = %for.body
; The result is %add as computed by the final loop iteration.
1105  ret i32 %add
1106}
1107
1108define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
1109; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
1110; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1111; CHECK-INTERLEAVE1-NEXT:  entry:
1112; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
1113; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 4
1114; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
1115; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1116; CHECK-INTERLEAVE1:       vector.ph:
1117; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
1118; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], 4
1119; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
1120; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
1121; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
1122; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP20]], 4
1123; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
1124; CHECK-INTERLEAVE1:       vector.body:
1125; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1126; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
1127; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
1128; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
1129; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
1130; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1131; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
1132; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
1133; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
1134; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
1135; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
1136; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
1137; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
1138; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
1139; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
1140; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
1141; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
1142; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1143; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
1144; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
1145; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
1146; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
1147; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
1148; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
1149; CHECK-INTERLEAVE1-NEXT:    [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
1150; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
1151; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
1152; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
1153; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
1154; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
1155; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
1156; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
1157; CHECK-INTERLEAVE1-NEXT:    [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
1158; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
1159; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
1160; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
1161; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
1162; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
1163; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
1164; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
1165; CHECK-INTERLEAVE1-NEXT:    [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
1166; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
1167; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
1168; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
1169; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
1170; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
1171; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
1172; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
1173; CHECK-INTERLEAVE1-NEXT:    [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
1174; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
1175; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1176; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
1177; CHECK-INTERLEAVE1:       middle.block:
1178; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP41]])
1179; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP35]])
1180; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP30]])
1181; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP23]])
1182; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
1183; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1184; CHECK-INTERLEAVE1:       scalar.ph:
1185; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1186; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1187; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX11:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1188; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP45]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1189; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP46]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1190; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
1191; CHECK-INTERLEAVE1:       for.body:
1192; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1193; CHECK-INTERLEAVE1-NEXT:    [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
1194; CHECK-INTERLEAVE1-NEXT:    [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX11]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
1195; CHECK-INTERLEAVE1-NEXT:    [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
1196; CHECK-INTERLEAVE1-NEXT:    [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
1197; CHECK-INTERLEAVE1-NEXT:    [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1198; CHECK-INTERLEAVE1-NEXT:    [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1199; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
1200; CHECK-INTERLEAVE1-NEXT:    [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
1201; CHECK-INTERLEAVE1-NEXT:    [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
1202; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
1203; CHECK-INTERLEAVE1-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
1204; CHECK-INTERLEAVE1-NEXT:    [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
1205; CHECK-INTERLEAVE1-NEXT:    [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
1206; CHECK-INTERLEAVE1-NEXT:    [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
1207; CHECK-INTERLEAVE1-NEXT:    [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
1208; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
1209; CHECK-INTERLEAVE1-NEXT:    [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
1210; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
1211; CHECK-INTERLEAVE1-NEXT:    [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
1212; CHECK-INTERLEAVE1-NEXT:    [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
1213; CHECK-INTERLEAVE1-NEXT:    [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
1214; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
1215; CHECK-INTERLEAVE1-NEXT:    [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
1216; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
1217; CHECK-INTERLEAVE1-NEXT:    [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
1218; CHECK-INTERLEAVE1-NEXT:    [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
1219; CHECK-INTERLEAVE1-NEXT:    [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
1220; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
1221; CHECK-INTERLEAVE1-NEXT:    [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
1222; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
1223; CHECK-INTERLEAVE1-NEXT:    [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
1224; CHECK-INTERLEAVE1-NEXT:    [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
1225; CHECK-INTERLEAVE1-NEXT:    [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
1226; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
1227; CHECK-INTERLEAVE1-NEXT:    [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
1228; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
1229; CHECK-INTERLEAVE1-NEXT:    [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
1230; CHECK-INTERLEAVE1-NEXT:    [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
1231; CHECK-INTERLEAVE1-NEXT:    [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
1232; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1233; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
1234; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
1235; CHECK-INTERLEAVE1:       exit:
1236; CHECK-INTERLEAVE1-NEXT:    [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP46]], [[MIDDLE_BLOCK]] ]
1237; CHECK-INTERLEAVE1-NEXT:    [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
1238; CHECK-INTERLEAVE1-NEXT:    [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ]
1239; CHECK-INTERLEAVE1-NEXT:    [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ]
1240; CHECK-INTERLEAVE1-NEXT:    [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
1241; CHECK-INTERLEAVE1-NEXT:    [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
1242; CHECK-INTERLEAVE1-NEXT:    [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
1243; CHECK-INTERLEAVE1-NEXT:    ret i32 [[RESULT]]
1244;
1245; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
1246; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1247; CHECK-INTERLEAVED-NEXT:  entry:
1248; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
1249; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 8
1250; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
1251; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1252; CHECK-INTERLEAVED:       vector.ph:
1253; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
1254; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], 8
1255; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
1256; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
1257; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
1258; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP34]], 8
1259; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
1260; CHECK-INTERLEAVED:       vector.body:
1261; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1262; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
1263; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
1264; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
1265; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
1266; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
1267; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
1268; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
1269; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
1270; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1271; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
1272; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
1273; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
1274; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
1275; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
1276; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
1277; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
1278; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
1279; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
1280; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
1281; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
1282; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1283; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
1284; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP56]], 4
1285; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
1286; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
1287; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
1288; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
1289; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
1290; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
1291; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
1292; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
1293; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
1294; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
1295; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
1296; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
1297; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
1298; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul nsw <vscale x 4 x i32> [[TMP28]], [[TMP66]]
1299; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
1300; CHECK-INTERLEAVED-NEXT:    [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
1301; CHECK-INTERLEAVED-NEXT:    [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
1302; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
1303; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
1304; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 4
1305; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
1306; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
1307; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
1308; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
1309; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
1310; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
1311; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
1312; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 4
1313; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
1314; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
1315; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
1316; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
1317; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
1318; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = mul nsw <vscale x 4 x i32> [[TMP38]], [[TMP44]]
1319; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
1320; CHECK-INTERLEAVED-NEXT:    [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
1321; CHECK-INTERLEAVED-NEXT:    [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
1322; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
1323; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
1324; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 4
1325; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
1326; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
1327; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
1328; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
1329; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
1330; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
1331; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
1332; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 4
1333; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
1334; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
1335; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
1336; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
1337; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
1338; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = mul nsw <vscale x 4 x i32> [[TMP54]], [[TMP60]]
1339; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
1340; CHECK-INTERLEAVED-NEXT:    [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
1341; CHECK-INTERLEAVED-NEXT:    [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
1342; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
1343; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
1344; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = mul i64 [[TMP67]], 4
1345; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
1346; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
1347; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
1348; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
1349; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
1350; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
1351; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
1352; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = mul i64 [[TMP73]], 4
1353; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
1354; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
1355; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
1356; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
1357; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
1358; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = mul nsw <vscale x 4 x i32> [[TMP70]], [[TMP76]]
1359; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = mul nsw <vscale x 4 x i32> [[TMP71]], [[TMP77]]
1360; CHECK-INTERLEAVED-NEXT:    [[TMP80]] = add <vscale x 4 x i32> [[TMP78]], [[VEC_PHI]]
1361; CHECK-INTERLEAVED-NEXT:    [[TMP81]] = add <vscale x 4 x i32> [[TMP79]], [[VEC_PHI1]]
1362; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]]
1363; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1364; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
1365; CHECK-INTERLEAVED:       middle.block:
1366; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]]
1367; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
1368; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]]
1369; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]])
1370; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]]
1371; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]])
1372; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP50]]
1373; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]])
1374; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
1375; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1376; CHECK-INTERLEAVED:       scalar.ph:
1377; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1378; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP83]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1379; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX26:%.*]] = phi i32 [ [[TMP84]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1380; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX27:%.*]] = phi i32 [ [[TMP85]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1381; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX28:%.*]] = phi i32 [ [[TMP86]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1382; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
1383; CHECK-INTERLEAVED:       for.body:
1384; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1385; CHECK-INTERLEAVED-NEXT:    [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
1386; CHECK-INTERLEAVED-NEXT:    [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX26]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
1387; CHECK-INTERLEAVED-NEXT:    [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX27]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
1388; CHECK-INTERLEAVED-NEXT:    [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX28]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
1389; CHECK-INTERLEAVED-NEXT:    [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1390; CHECK-INTERLEAVED-NEXT:    [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1391; CHECK-INTERLEAVED-NEXT:    [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
1392; CHECK-INTERLEAVED-NEXT:    [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
1393; CHECK-INTERLEAVED-NEXT:    [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
1394; CHECK-INTERLEAVED-NEXT:    [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
1395; CHECK-INTERLEAVED-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
1396; CHECK-INTERLEAVED-NEXT:    [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
1397; CHECK-INTERLEAVED-NEXT:    [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
1398; CHECK-INTERLEAVED-NEXT:    [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
1399; CHECK-INTERLEAVED-NEXT:    [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
1400; CHECK-INTERLEAVED-NEXT:    [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
1401; CHECK-INTERLEAVED-NEXT:    [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
1402; CHECK-INTERLEAVED-NEXT:    [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
1403; CHECK-INTERLEAVED-NEXT:    [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
1404; CHECK-INTERLEAVED-NEXT:    [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
1405; CHECK-INTERLEAVED-NEXT:    [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
1406; CHECK-INTERLEAVED-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
1407; CHECK-INTERLEAVED-NEXT:    [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
1408; CHECK-INTERLEAVED-NEXT:    [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
1409; CHECK-INTERLEAVED-NEXT:    [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
1410; CHECK-INTERLEAVED-NEXT:    [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
1411; CHECK-INTERLEAVED-NEXT:    [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
1412; CHECK-INTERLEAVED-NEXT:    [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
1413; CHECK-INTERLEAVED-NEXT:    [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
1414; CHECK-INTERLEAVED-NEXT:    [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
1415; CHECK-INTERLEAVED-NEXT:    [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
1416; CHECK-INTERLEAVED-NEXT:    [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
1417; CHECK-INTERLEAVED-NEXT:    [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
1418; CHECK-INTERLEAVED-NEXT:    [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
1419; CHECK-INTERLEAVED-NEXT:    [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
1420; CHECK-INTERLEAVED-NEXT:    [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
1421; CHECK-INTERLEAVED-NEXT:    [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
1422; CHECK-INTERLEAVED-NEXT:    [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
1423; CHECK-INTERLEAVED-NEXT:    [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
1424; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1425; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
1426; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
1427; CHECK-INTERLEAVED:       exit:
1428; CHECK-INTERLEAVED-NEXT:    [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
1429; CHECK-INTERLEAVED-NEXT:    [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
1430; CHECK-INTERLEAVED-NEXT:    [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ]
1431; CHECK-INTERLEAVED-NEXT:    [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP83]], [[MIDDLE_BLOCK]] ]
1432; CHECK-INTERLEAVED-NEXT:    [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
1433; CHECK-INTERLEAVED-NEXT:    [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
1434; CHECK-INTERLEAVED-NEXT:    [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
1435; CHECK-INTERLEAVED-NEXT:    ret i32 [[RESULT]]
1436;
1437; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
1438; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1439; CHECK-MAXBW-NEXT:  entry:
1440; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1441; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
1442; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
1443; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1444; CHECK-MAXBW:       vector.ph:
1445; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1446; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
1447; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
1448; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
1449; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1450; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
1451; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
1452; CHECK-MAXBW:       vector.body:
1453; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1454; CHECK-MAXBW-NEXT:    [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
1455; CHECK-MAXBW-NEXT:    [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
1456; CHECK-MAXBW-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
1457; CHECK-MAXBW-NEXT:    [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
1458; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
1459; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
1460; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
1461; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
1462; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
1463; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
1464; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2
1465; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
1466; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
1467; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3
1468; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
1469; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
1470; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
1471; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
1472; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
1473; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
1474; CHECK-MAXBW-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP24]], align 1
1475; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32>
1476; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]]
1477; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]])
1478; CHECK-MAXBW-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
1479; CHECK-MAXBW-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP32]], align 1
1480; CHECK-MAXBW-NEXT:    [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32>
1481; CHECK-MAXBW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
1482; CHECK-MAXBW-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP38]], align 1
1483; CHECK-MAXBW-NEXT:    [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32>
1484; CHECK-MAXBW-NEXT:    [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]]
1485; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]])
1486; CHECK-MAXBW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
1487; CHECK-MAXBW-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP46]], align 1
1488; CHECK-MAXBW-NEXT:    [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32>
1489; CHECK-MAXBW-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
1490; CHECK-MAXBW-NEXT:    [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP52]], align 1
1491; CHECK-MAXBW-NEXT:    [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32>
1492; CHECK-MAXBW-NEXT:    [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]]
1493; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]])
1494; CHECK-MAXBW-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
1495; CHECK-MAXBW-NEXT:    [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP60]], align 1
1496; CHECK-MAXBW-NEXT:    [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32>
1497; CHECK-MAXBW-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
1498; CHECK-MAXBW-NEXT:    [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP66]], align 1
1499; CHECK-MAXBW-NEXT:    [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32>
1500; CHECK-MAXBW-NEXT:    [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]]
1501; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]])
1502; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1503; CHECK-MAXBW-NEXT:    [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1504; CHECK-MAXBW-NEXT:    br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
1505; CHECK-MAXBW:       middle.block:
1506; CHECK-MAXBW-NEXT:    [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE16]])
1507; CHECK-MAXBW-NEXT:    [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE17]])
1508; CHECK-MAXBW-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]])
1509; CHECK-MAXBW-NEXT:    [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE11]])
1510; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
1511; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1512; CHECK-MAXBW:       scalar.ph:
1513; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1514; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP39]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1515; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP40]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1516; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP41]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1517; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP42]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1518; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
1519; CHECK-MAXBW:       for.body:
1520; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1521; CHECK-MAXBW-NEXT:    [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
1522; CHECK-MAXBW-NEXT:    [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
1523; CHECK-MAXBW-NEXT:    [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
1524; CHECK-MAXBW-NEXT:    [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
1525; CHECK-MAXBW-NEXT:    [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1526; CHECK-MAXBW-NEXT:    [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1527; CHECK-MAXBW-NEXT:    [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
1528; CHECK-MAXBW-NEXT:    [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
1529; CHECK-MAXBW-NEXT:    [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
1530; CHECK-MAXBW-NEXT:    [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
1531; CHECK-MAXBW-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
1532; CHECK-MAXBW-NEXT:    [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
1533; CHECK-MAXBW-NEXT:    [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
1534; CHECK-MAXBW-NEXT:    [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
1535; CHECK-MAXBW-NEXT:    [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
1536; CHECK-MAXBW-NEXT:    [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
1537; CHECK-MAXBW-NEXT:    [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
1538; CHECK-MAXBW-NEXT:    [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
1539; CHECK-MAXBW-NEXT:    [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
1540; CHECK-MAXBW-NEXT:    [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
1541; CHECK-MAXBW-NEXT:    [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
1542; CHECK-MAXBW-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
1543; CHECK-MAXBW-NEXT:    [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
1544; CHECK-MAXBW-NEXT:    [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
1545; CHECK-MAXBW-NEXT:    [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
1546; CHECK-MAXBW-NEXT:    [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
1547; CHECK-MAXBW-NEXT:    [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
1548; CHECK-MAXBW-NEXT:    [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
1549; CHECK-MAXBW-NEXT:    [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
1550; CHECK-MAXBW-NEXT:    [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
1551; CHECK-MAXBW-NEXT:    [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
1552; CHECK-MAXBW-NEXT:    [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
1553; CHECK-MAXBW-NEXT:    [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
1554; CHECK-MAXBW-NEXT:    [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
1555; CHECK-MAXBW-NEXT:    [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
1556; CHECK-MAXBW-NEXT:    [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
1557; CHECK-MAXBW-NEXT:    [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
1558; CHECK-MAXBW-NEXT:    [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
1559; CHECK-MAXBW-NEXT:    [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
1560; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1561; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
1562; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
1563; CHECK-MAXBW:       exit:
1564; CHECK-MAXBW-NEXT:    [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ]
1565; CHECK-MAXBW-NEXT:    [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
1566; CHECK-MAXBW-NEXT:    [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ]
1567; CHECK-MAXBW-NEXT:    [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
1568; CHECK-MAXBW-NEXT:    [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
1569; CHECK-MAXBW-NEXT:    [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
1570; CHECK-MAXBW-NEXT:    [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
1571; CHECK-MAXBW-NEXT:    ret i32 [[RESULT]]
1572;
1573entry:
1574  br label %for.body
1575
1576for.body:                                    ; preds = %entry, %for.body
1577  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1578  %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ]
1579  %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ]
1580  %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ]
1581  %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ]
1582  %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv
1583  %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv
1584  %offset.1 = or disjoint i64 %iv, 1
1585  %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1
1586  %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1
1587  %offset.2 = or disjoint i64 %iv, 2
1588  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2
1589  %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2
1590  %offset.3 = or disjoint i64 %iv, 3
1591  %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3
1592  %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3
1593  %load.a0 = load i8, ptr %gep.a0, align 1
1594  %ext.a0 = sext i8 %load.a0 to i32
1595  %load.b0 = load i8, ptr %gep.b0, align 1
1596  %ext.b0 = sext i8 %load.b0 to i32
1597  %mul.a0 = mul nsw i32 %ext.b0, %ext.a0
1598  %add.a0 = add nsw i32 %mul.a0, %accum0
1599  %load.a1 = load i8, ptr %gep.a1, align 1
1600  %ext.a1 = sext i8 %load.a1 to i32
1601  %load.b1 = load i8, ptr %gep.b1, align 1
1602  %ext.b1 = sext i8 %load.b1 to i32
1603  %mul.a1 = mul nsw i32 %ext.a1, %ext.b1
1604  %add.a1 = add nsw i32 %mul.a1, %accum1
1605  %load.a2 = load i8, ptr %gep.a2, align 1
1606  %ext.a2 = sext i8 %load.a2 to i32
1607  %load.b2 = load i8, ptr %gep.b2, align 1
1608  %ext.b2 = sext i8 %load.b2 to i32
1609  %mul.a2 = mul nsw i32 %ext.a2, %ext.b2
1610  %add.a2 = add nsw i32 %mul.a2, %accum2
1611  %load.a3 = load i8, ptr %gep.a3, align 1
1612  %ext.a3 = sext i8 %load.a3 to i32
1613  %load.b3 = load i8, ptr %gep.b3, align 1
1614  %ext.b3 = sext i8 %load.b3 to i32
1615  %mul.a3 = mul nsw i32 %ext.a3, %ext.b3
1616  %add.a3 = add nsw i32 %mul.a3, %accum3
1617  %iv.next = add nuw nsw i64 %iv, 1
1618  %exitcond.not = icmp eq i64 %iv.next, %num_in
1619  br i1 %exitcond.not, label %exit, label %for.body
1620
1621exit:                                        ; preds = %for.body
1622  %result0 = add nsw i32 %add.a0, %add.a1
1623  %result1 = add nsw i32 %add.a2, %add.a3
1624  %result = add nsw i32 %result0, %result1
1625  ret i32 %result
1626}
1627
; @not_dotp_predicated - i8 multiply-accumulate reduction into an i32, with the
; trip count taken from the runtime argument %N.
;
; What the autogenerated assertions below expect from the loop vectorizer for
; each RUN configuration at the top of the file:
;   * -force-vector-interleave=1 (prefix CHECK-INTERLEAVE1)
;       one <vscale x 4 x i32> accumulator phi, updated with a plain vector add.
;   * default interleaving (prefix CHECK-INTERLEAVED)
;       two <vscale x 4 x i32> accumulator phis, added together in middle.block
;       before the final vector.reduce.add.
;   * -vectorizer-maximize-bandwidth (prefix CHECK-MAXBW)
;       <vscale x 8 x i8> loads whose widened product is folded into a
;       <vscale x 2 x i32> accumulator by a partial.reduce.add call.
; All three expectations use unmasked wide loads and keep a scalar remainder
; loop (for.body) behind a minimum-iteration check in the entry block.
;
; NOTE(review): despite the "not_dotp" name, the -vectorizer-maximize-bandwidth
; expectations do contain a partial reduction - confirm the name is still
; accurate for what this test is meant to cover.
;
; The assertion lines are generated by utils/update_test_checks.py; regenerate
; them with that script rather than editing them by hand.
1628define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
1629; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
1630; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1631; CHECK-INTERLEAVE1-NEXT:  entry:
1632; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
1633; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
1634; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]]
1635; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
1636; CHECK-INTERLEAVE1:       vector.ph:
1637; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
1638; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP10]], 4
1639; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
1640; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
1641; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
1642; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP11]], 4
1643; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
1644; CHECK-INTERLEAVE1:       vector.body:
1645; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1646; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
1647; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1648; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
1649; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1650; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
1651; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
1652; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
1653; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
1654; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
1655; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
1656; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul nsw <vscale x 4 x i32> [[TMP12]], [[TMP9]]
1657; CHECK-INTERLEAVE1-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
1658; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
1659; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1660; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
1661; CHECK-INTERLEAVE1:       middle.block:
1662; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
1663; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
1664; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
1665; CHECK-INTERLEAVE1:       scalar.ph:
1666; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1667; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1668; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
1669; CHECK-INTERLEAVE1:       for.body:
1670; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1671; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1672; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1673; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1674; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
1675; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1676; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
1677; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
1678; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
1679; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
1680; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1681; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
1682; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
1683; CHECK-INTERLEAVE1:       exit:
1684; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
1685; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
1686;
1687; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
1688; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1689; CHECK-INTERLEAVED-NEXT:  entry:
1690; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
1691; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP8]], 8
1692; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]]
1693; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
1694; CHECK-INTERLEAVED:       vector.ph:
1695; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
1696; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP15]], 8
1697; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
1698; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
1699; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1700; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
1701; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
1702; CHECK-INTERLEAVED:       vector.body:
1703; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1704; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
1705; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
1706; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1707; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
1708; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
1709; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
1710; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
1711; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
1712; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
1713; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
1714; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
1715; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
1716; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
1717; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
1718; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
1719; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
1720; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]]
1721; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
1722; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
1723; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
1724; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
1725; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 4 x i32> [[TMP19]], [[TMP12]]
1726; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul nsw <vscale x 4 x i32> [[TMP20]], [[TMP25]]
1727; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
1728; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
1729; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1730; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1731; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
1732; CHECK-INTERLEAVED:       middle.block:
1733; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
1734; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
1735; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
1736; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
1737; CHECK-INTERLEAVED:       scalar.ph:
1738; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1739; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1740; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
1741; CHECK-INTERLEAVED:       for.body:
1742; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1743; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1744; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1745; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1746; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
1747; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1748; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
1749; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
1750; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
1751; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
1752; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1753; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
1754; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
1755; CHECK-INTERLEAVED:       exit:
1756; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
1757; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
1758;
1759; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
1760; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1761; CHECK-MAXBW-NEXT:  entry:
1762; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1763; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
1764; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
1765; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1766; CHECK-MAXBW:       vector.ph:
1767; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1768; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
1769; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
1770; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
1771; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1772; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
1773; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
1774; CHECK-MAXBW:       vector.body:
1775; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1776; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
1777; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
1778; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
1779; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
1780; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
1781; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
1782; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
1783; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
1784; CHECK-MAXBW-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
1785; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
1786; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = mul nsw <vscale x 8 x i32> [[TMP20]], [[TMP13]]
1787; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
1788; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1789; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1790; CHECK-MAXBW-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
1791; CHECK-MAXBW:       middle.block:
1792; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]])
1793; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
1794; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1795; CHECK-MAXBW:       scalar.ph:
1796; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1797; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1798; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
1799; CHECK-MAXBW:       for.body:
1800; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1801; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1802; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1803; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1804; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
1805; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1806; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
1807; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
1808; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
1809; CHECK-MAXBW-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
1810; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1811; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
1812; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
1813; CHECK-MAXBW:       exit:
1814; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
1815; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
1816;
1817entry:
1818  br label %for.body
1819
; Scalar input loop: each iteration loads one i8 from %a and one from %b at
; index %iv, sign-extends both to i32, multiplies them and adds the product to
; %accum. The loop exits once %iv.next equals %N.
1820for.body:                                         ; preds = %entry, %for.body
1821  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1822  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
1823  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
1824  %load.a = load i8, ptr %gep.a, align 1
1825  %ext.a = sext i8 %load.a to i32
1826  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
1827  %load.b = load i8, ptr %gep.b, align 1
1828  %ext.b = sext i8 %load.b to i32
1829  %mul = mul nsw i32 %ext.b, %ext.a
1830  %add = add nsw i32 %mul, %accum
1831  %iv.next = add nuw nsw i64 %iv, 1
1832  %exitcond.not = icmp eq i64 %iv.next, %N
1833  br i1 %exitcond.not, label %exit, label %for.body
1834
; The final accumulator value is the only live-out and is returned as-is.
1835exit:                        ; preds = %for.body
1836  ret i32 %add
1837}
1838
1839define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
1840; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
1841; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1842; CHECK-INTERLEAVE1-NEXT:  entry:
1843; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1844; CHECK-INTERLEAVE1:       vector.ph:
1845; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
1846; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
1847; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
1848; CHECK-INTERLEAVE1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
1849; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
1850; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
1851; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
1852; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP12]], 4
1853; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
1854; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP15]], 4
1855; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
1856; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
1857; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
1858; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
1859; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
1860; CHECK-INTERLEAVE1:       vector.body:
1861; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1862; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
1863; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
1864; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
1865; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
1866; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
1867; CHECK-INTERLEAVE1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
1868; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
1869; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
1870; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
1871; CHECK-INTERLEAVE1-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
1872; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
1873; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
1874; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
1875; CHECK-INTERLEAVE1-NEXT:    [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
1876; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
1877; CHECK-INTERLEAVE1-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
1878; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
1879; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
1880; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
1881; CHECK-INTERLEAVE1:       middle.block:
1882; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
1883; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
1884; CHECK-INTERLEAVE1:       scalar.ph:
1885; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1886; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1887; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
1888; CHECK-INTERLEAVE1:       for.body:
1889; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1890; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1891; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1892; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1893; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
1894; CHECK-INTERLEAVE1-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1895; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
1896; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
1897; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
1898; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
1899; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1900; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
1901; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
1902; CHECK-INTERLEAVE1:       exit:
1903; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
1904; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
1905;
1906; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
1907; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1908; CHECK-INTERLEAVED-NEXT:  entry:
1909; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1910; CHECK-INTERLEAVED:       vector.ph:
1911; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
1912; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
1913; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
1914; CHECK-INTERLEAVED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
1915; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
1916; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
1917; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
1918; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP12]], 4
1919; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
1920; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP15]], 4
1921; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
1922; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
1923; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
1924; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
1925; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
1926; CHECK-INTERLEAVED:       vector.body:
1927; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1928; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
1929; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
1930; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
1931; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
1932; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
1933; CHECK-INTERLEAVED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
1934; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
1935; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
1936; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
1937; CHECK-INTERLEAVED-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
1938; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
1939; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
1940; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
1941; CHECK-INTERLEAVED-NEXT:    [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
1942; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
1943; CHECK-INTERLEAVED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
1944; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
1945; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
1946; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
1947; CHECK-INTERLEAVED:       middle.block:
1948; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
1949; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
1950; CHECK-INTERLEAVED:       scalar.ph:
1951; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1952; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1953; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
1954; CHECK-INTERLEAVED:       for.body:
1955; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
1956; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
1957; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
1958; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
1959; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
1960; CHECK-INTERLEAVED-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
1961; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
1962; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
1963; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
1964; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
1965; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
1966; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
1967; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
1968; CHECK-INTERLEAVED:       exit:
1969; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
1970; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
1971;
1972; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
1973; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
1974; CHECK-MAXBW-NEXT:  entry:
1975; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1976; CHECK-MAXBW:       vector.ph:
1977; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1978; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
1979; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
1980; CHECK-MAXBW-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
1981; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
1982; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
1983; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
1984; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
1985; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
1986; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
1987; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
1988; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
1989; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
1990; CHECK-MAXBW-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
1991; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
1992; CHECK-MAXBW:       vector.body:
1993; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1994; CHECK-MAXBW-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
1995; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
1996; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
1997; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
1998; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
1999; CHECK-MAXBW-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
2000; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
2001; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
2002; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
2003; CHECK-MAXBW-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
2004; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
2005; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
2006; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
2007; CHECK-MAXBW-NEXT:    [[TMP19]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> [[VEC_PHI]]
2008; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
2009; CHECK-MAXBW-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
2010; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
2011; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
2012; CHECK-MAXBW-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
2013; CHECK-MAXBW:       middle.block:
2014; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
2015; CHECK-MAXBW-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
2016; CHECK-MAXBW:       scalar.ph:
2017; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2018; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2019; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
2020; CHECK-MAXBW:       for.body:
2021; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2022; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2023; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
2024; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
2025; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
2026; CHECK-MAXBW-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
2027; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
2028; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
2029; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
2030; CHECK-MAXBW-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
2031; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
2032; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
2033; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
2034; CHECK-MAXBW:       exit:
2035; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
2036; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
2037;
2038entry:
2039  br label %for.body
2040
2041for.body:                                         ; preds = %entry, %for.body
2042  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
2043  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
2044  %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
2045  %load.a = load i8, ptr %gep.a, align 1
2046  %ext.a = sext i8 %load.a to i32
2047  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
2048  %load.b = load i8, ptr %gep.a2, align 1
2049  %ext.b = sext i8 %load.b to i32
2050  %mul = mul nsw i32 %ext.b, %ext.a
2051  %add = add nsw i32 %mul, %accum
2052  %iv.next = add nuw nsw i64 %iv, 1
2053  %exitcond.not = icmp eq i64 %iv.next, %N
2054  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
2055
2056exit:                        ; preds = %for.body
2057  ret i32 %add
2058}
2059
; @not_dotp_extend_user: i8 multiply-accumulate loop over %a and %b (1024
; iterations, both operands zero-extended to i32) in which the extended %b
; value is additionally used after the loop, in for.exit.
; The autogenerated assertions below expect, for each RUN configuration, plain
; scalable-vector mul/add in vector.body, a final llvm.vector.reduce.add in
; middle.block, and an extractelement of the last lane of the widened %b extend
; to feed the out-of-loop user.
; NOTE(review): judging by the function name, the out-of-loop use of the extend
; is presumably what keeps this loop from being turned into a dot-product
; partial reduction -- confirm against the vectorizer's matching logic.
2060define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
2061; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
2062; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
2063; CHECK-INTERLEAVE1-NEXT:  entry:
2064; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2065; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
2066; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2067; CHECK-INTERLEAVE1:       vector.ph:
2068; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
2069; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP8]], 4
2070; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]]
2071; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
2072; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
2073; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP11]], 4
2074; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
2075; CHECK-INTERLEAVE1:       vector.body:
2076; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2077; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
2078; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
2079; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
2080; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
2081; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
2082; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
2083; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
2084; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
2085; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
2086; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
2087; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
2088; CHECK-INTERLEAVE1-NEXT:    [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
2089; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2090; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2091; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
2092; CHECK-INTERLEAVE1:       middle.block:
2093; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
2094; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
2095; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
2096; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
2097; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]]
2098; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
2099; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
2100; CHECK-INTERLEAVE1:       scalar.ph:
2101; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2102; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2103; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
2104; CHECK-INTERLEAVE1:       for.body:
2105; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2106; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2107; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
2108; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
2109; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
2110; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
2111; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
2112; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
2113; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
2114; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
2115; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
2116; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
2117; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
2118; CHECK-INTERLEAVE1:       for.exit:
2119; CHECK-INTERLEAVE1-NEXT:    [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
2120; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
2121; CHECK-INTERLEAVE1-NEXT:    [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
2122; CHECK-INTERLEAVE1-NEXT:    ret i32 [[RESULT]]
2123;
2124; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
2125; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
2126; CHECK-INTERLEAVED-NEXT:  entry:
2127; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2128; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
2129; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2130; CHECK-INTERLEAVED:       vector.ph:
2131; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
2132; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
2133; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]]
2134; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
2135; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
2136; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP14]], 8
2137; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
2138; CHECK-INTERLEAVED:       vector.body:
2139; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2140; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
2141; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
2142; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
2143; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
2144; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
2145; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
2146; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP15]], 4
2147; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
2148; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
2149; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
2150; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
2151; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
2152; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
2153; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
2154; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
2155; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
2156; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
2157; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
2158; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
2159; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
2160; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
2161; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
2162; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
2163; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
2164; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
2165; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2166; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2167; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
2168; CHECK-INTERLEAVED:       middle.block:
2169; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
2170; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
2171; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
2172; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], 4
2173; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP28]], 1
2174; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]]
2175; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
2176; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
2177; CHECK-INTERLEAVED:       scalar.ph:
2178; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2179; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2180; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
2181; CHECK-INTERLEAVED:       for.body:
2182; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2183; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2184; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
2185; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
2186; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
2187; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
2188; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
2189; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
2190; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
2191; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
2192; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
2193; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
2194; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
2195; CHECK-INTERLEAVED:       for.exit:
2196; CHECK-INTERLEAVED-NEXT:    [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
2197; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
2198; CHECK-INTERLEAVED-NEXT:    [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
2199; CHECK-INTERLEAVED-NEXT:    ret i32 [[RESULT]]
2200;
2201; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
2202; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
2203; CHECK-MAXBW-NEXT:  entry:
2204; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2205; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
2206; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2207; CHECK-MAXBW:       vector.ph:
2208; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
2209; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
2210; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
2211; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
2212; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2213; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
2214; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
2215; CHECK-MAXBW:       vector.body:
2216; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2217; CHECK-MAXBW-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
2218; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
2219; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
2220; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
2221; CHECK-MAXBW-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
2222; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
2223; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
2224; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
2225; CHECK-MAXBW-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
2226; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
2227; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
2228; CHECK-MAXBW-NEXT:    [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
2229; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2230; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2231; CHECK-MAXBW-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
2232; CHECK-MAXBW:       middle.block:
2233; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
2234; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
2235; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
2236; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
2237; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i32> [[TMP20]], i32 [[TMP19]]
2238; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
2239; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
2240; CHECK-MAXBW:       scalar.ph:
2241; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2242; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2243; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
2244; CHECK-MAXBW:       for.body:
2245; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2246; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2247; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
2248; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
2249; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
2250; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
2251; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
2252; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
2253; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
2254; CHECK-MAXBW-NEXT:    [[ADD]] = add i32 [[MUL]], [[ACCUM]]
2255; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
2256; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
2257; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
2258; CHECK-MAXBW:       for.exit:
2259; CHECK-MAXBW-NEXT:    [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
2260; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
2261; CHECK-MAXBW-NEXT:    [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
2262; CHECK-MAXBW-NEXT:    ret i32 [[RESULT]]
2263;
2264entry:
2265  br label %for.body
2266
; Scalar input loop: %iv counts from 0 and the loop exits once %iv.next == 1024;
; %accum carries the running sum of zext(b[iv]) * zext(a[iv]).
2267for.body:                                         ; preds = %for.body, %entry
2268  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
2269  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
2270  %gep.a = getelementptr i8, ptr %a, i64 %iv
2271  %load.a = load i8, ptr %gep.a, align 1
2272  %ext.a = zext i8 %load.a to i32
2273  %gep.b = getelementptr i8, ptr %b, i64 %iv
2274  %load.b = load i8, ptr %gep.b, align 1
  ; %ext.b feeds the multiply below and is also read again in for.exit.
2275  %ext.b = zext i8 %load.b to i32
2276  %mul = mul i32 %ext.b, %ext.a
2277  %add = add i32 %mul, %accum
2278  %iv.next = add i64 %iv, 1
2279  %exitcond.not = icmp eq i64 %iv.next, 1024
2280  br i1 %exitcond.not, label %for.exit, label %for.body
2281
2282for.exit:                        ; preds = %for.body
  ; Extra user of the extend outside the loop: the value of %ext.b from the
  ; final iteration is added to the reduction result.
2283  %result = add i32 %add, %ext.b
2284  ret i32 %result
2285}
2286
2287define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
2288; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement(
2289; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
2290; CHECK-INTERLEAVE1-NEXT:  entry:
2291; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2292; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
2293; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
2294; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2295; CHECK-INTERLEAVE1:       vector.ph:
2296; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
2297; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
2298; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
2299; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
2300; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2301; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
2302; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
2303; CHECK-INTERLEAVE1:       vector.body:
2304; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2305; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
2306; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
2307; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
2308; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
2309; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
2310; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
2311; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
2312; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
2313; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
2314; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP12]], align 1
2315; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64>
2316; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]]
2317; CHECK-INTERLEAVE1-NEXT:    [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
2318; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2319; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2320; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
2321; CHECK-INTERLEAVE1:       middle.block:
2322; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
2323; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
2324; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
2325; CHECK-INTERLEAVE1:       scalar.ph:
2326; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2327; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2328; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
2329; CHECK-INTERLEAVE1:       for.body:
2330; CHECK-INTERLEAVE1-NEXT:    [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
2331; CHECK-INTERLEAVE1-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2332; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
2333; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
2334; CHECK-INTERLEAVE1-NEXT:    [[CONV:%.*]] = zext i8 [[TMP18]] to i64
2335; CHECK-INTERLEAVE1-NEXT:    [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
2336; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
2337; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
2338; CHECK-INTERLEAVE1-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP19]] to i64
2339; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
2340; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i64 [[SUM]], [[MUL]]
2341; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
2342; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
2343; CHECK-INTERLEAVE1:       exit:
2344; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
2345; CHECK-INTERLEAVE1-NEXT:    ret i64 [[ADD_LCSSA]]
2346;
2347; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
2348; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
2349; CHECK-INTERLEAVED-NEXT:  entry:
2350; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2351; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
2352; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
2353; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2354; CHECK-INTERLEAVED:       vector.ph:
2355; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
2356; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
2357; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
2358; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
2359; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2360; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
2361; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
2362; CHECK-INTERLEAVED:       vector.body:
2363; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2364; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
2365; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
2366; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
2367; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
2368; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
2369; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
2370; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
2371; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
2372; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
2373; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
2374; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
2375; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
2376; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1
2377; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
2378; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
2379; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
2380; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
2381; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
2382; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
2383; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
2384; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
2385; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
2386; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]]
2387; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]]
2388; CHECK-INTERLEAVED-NEXT:    [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]]
2389; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]]
2390; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2391; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2392; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
2393; CHECK-INTERLEAVED:       middle.block:
2394; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
2395; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
2396; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
2397; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
2398; CHECK-INTERLEAVED:       scalar.ph:
2399; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2400; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2401; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
2402; CHECK-INTERLEAVED:       for.body:
2403; CHECK-INTERLEAVED-NEXT:    [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
2404; CHECK-INTERLEAVED-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2405; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
2406; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
2407; CHECK-INTERLEAVED-NEXT:    [[CONV:%.*]] = zext i8 [[TMP28]] to i64
2408; CHECK-INTERLEAVED-NEXT:    [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
2409; CHECK-INTERLEAVED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
2410; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
2411; CHECK-INTERLEAVED-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP29]] to i64
2412; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
2413; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i64 [[SUM]], [[MUL]]
2414; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
2415; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
2416; CHECK-INTERLEAVED:       exit:
2417; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
2418; CHECK-INTERLEAVED-NEXT:    ret i64 [[ADD_LCSSA]]
2419;
2420; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
2421; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
2422; CHECK-MAXBW-NEXT:  entry:
2423; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2424; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
2425; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
2426; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2427; CHECK-MAXBW:       vector.ph:
2428; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
2429; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
2430; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
2431; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
2432; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2433; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
2434; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
2435; CHECK-MAXBW:       vector.body:
2436; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2437; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
2438; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
2439; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
2440; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
2441; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
2442; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
2443; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
2444; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
2445; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
2446; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
2447; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
2448; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
2449; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 1 x i64> @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64(<vscale x 1 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP14]])
2450; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2451; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2452; CHECK-MAXBW-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
2453; CHECK-MAXBW:       middle.block:
2454; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
2455; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
2456; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
2457; CHECK-MAXBW:       scalar.ph:
2458; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2459; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
2460; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
2461; CHECK-MAXBW:       for.body:
2462; CHECK-MAXBW-NEXT:    [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
2463; CHECK-MAXBW-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2464; CHECK-MAXBW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
2465; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
2466; CHECK-MAXBW-NEXT:    [[CONV:%.*]] = zext i8 [[TMP17]] to i64
2467; CHECK-MAXBW-NEXT:    [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
2468; CHECK-MAXBW-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
2469; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
2470; CHECK-MAXBW-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP18]] to i64
2471; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
2472; CHECK-MAXBW-NEXT:    [[ADD]] = add i64 [[SUM]], [[MUL]]
2473; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
2474; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
2475; CHECK-MAXBW:       exit:
2476; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
2477; CHECK-MAXBW-NEXT:    ret i64 [[ADD_LCSSA]]
2478;
2479entry:
2480  br label %for.body
2481
2482for.body:                                         ; preds = %entry, %for.body
2483  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
2484  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
2485  %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
2486  %0 = load i8, ptr %arrayidx, align 1
2487  %conv = zext i8 %0 to i64
2488  %i.iv.next = add nuw nsw i64 %i.iv, 1
2489  %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
2490  %1 = load i8, ptr %arrayidx2, align 1
2491  %conv3 = zext i8 %1 to i64
2492  %mul = mul nuw nsw i64 %conv3, %conv
2493  %add = add i64 %sum, %mul
2494  %exitcond.not = icmp eq i64 %i.iv.next, 16
2495  br i1 %exitcond.not, label %exit, label %for.body
2496
2497exit:                                 ; preds = %for.body
2498  ret i64 %add
2499}
2500
; Loop with two multiply-add steps per iteration that feed a single i32
; accumulator: %add consumes the %accum phi, then %add.1 consumes %add, and
; %add.1 is the value carried back into %accum. Each product is a sext'd i8
; loaded from %ptr times a sext'd i8 that is loaded once in the preheader.
; Going by the "not_dotp" prefix this is presumably a negative test for
; partial-reduction formation (TODO confirm against the originating commit).
; The expected output below contains no partial.reduce intrinsic call for any
; of the three opt invocations: with the interleave count forced to 1 (with or
; without -vectorizer-maximize-bandwidth) the loop is left unvectorized, and
; with the default interleave setting it is only interleaved by two using
; scalar i32 accumulators.
2501define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
2502; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi2(
2503; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
2504; CHECK-INTERLEAVE1-NEXT:  entry:
2505; CHECK-INTERLEAVE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
2506; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
2507; CHECK-INTERLEAVE1:       for.preheader:
2508; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr null, align 1
2509; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
2510; CHECK-INTERLEAVE1-NEXT:    [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
2511; CHECK-INTERLEAVE1-NEXT:    [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
2512; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
2513; CHECK-INTERLEAVE1:       for.body:
2514; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
2515; CHECK-INTERLEAVE1-NEXT:    [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
2516; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
2517; CHECK-INTERLEAVE1-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
2518; CHECK-INTERLEAVE1-NEXT:    [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
2519; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
2520; CHECK-INTERLEAVE1-NEXT:    [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
2521; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
2522; CHECK-INTERLEAVE1-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
2523; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
2524; CHECK-INTERLEAVE1-NEXT:    [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
2525; CHECK-INTERLEAVE1-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
2526; CHECK-INTERLEAVE1-NEXT:    [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
2527; CHECK-INTERLEAVE1-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
2528; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
2529; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
2530; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
2531; CHECK-INTERLEAVE1:       for.exit:
2532; CHECK-INTERLEAVE1-NEXT:    [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ]
2533; CHECK-INTERLEAVE1-NEXT:    [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
2534; CHECK-INTERLEAVE1-NEXT:    br label [[EXIT]]
2535; CHECK-INTERLEAVE1:       exit:
2536; CHECK-INTERLEAVE1-NEXT:    [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
2537; CHECK-INTERLEAVE1-NEXT:    store float [[RESULT]], ptr [[MATRIX]], align 4
2538; CHECK-INTERLEAVE1-NEXT:    ret void
2539;
2540; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi2(
2541; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
2542; CHECK-INTERLEAVED-NEXT:  entry:
2543; CHECK-INTERLEAVED-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
2544; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
2545; CHECK-INTERLEAVED:       for.preheader:
2546; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr null, align 1
2547; CHECK-INTERLEAVED-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
2548; CHECK-INTERLEAVED-NEXT:    [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
2549; CHECK-INTERLEAVED-NEXT:    [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
2550; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
2551; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
2552; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2553; CHECK-INTERLEAVED:       vector.ph:
2554; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
2555; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
2556; CHECK-INTERLEAVED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
2557; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[N_VEC]], 16
2558; CHECK-INTERLEAVED-NEXT:    [[IND_END1:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP1]]
2559; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
2560; CHECK-INTERLEAVED:       vector.body:
2561; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2562; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
2563; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
2564; CHECK-INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
2565; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
2566; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 16
2567; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]]
2568; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP3]]
2569; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1
2570; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1
2571; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 2
2572; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 2
2573; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP4]], align 1
2574; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1
2575; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = sext i8 [[TMP8]] to i32
2576; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = sext i8 [[TMP9]] to i32
2577; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = mul nsw i32 [[A_EXT]], [[TMP10]]
2578; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul nsw i32 [[A_EXT]], [[TMP11]]
2579; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[VEC_PHI]]
2580; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], [[VEC_PHI2]]
2581; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP6]], align 1
2582; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP7]], align 1
2583; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = sext i8 [[TMP16]] to i32
2584; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = sext i8 [[TMP17]] to i32
2585; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP18]]
2586; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP19]]
2587; CHECK-INTERLEAVED-NEXT:    [[TMP22]] = add i32 [[TMP20]], [[TMP14]]
2588; CHECK-INTERLEAVED-NEXT:    [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
2589; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
2590; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2591; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
2592; CHECK-INTERLEAVED:       middle.block:
2593; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
2594; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
2595; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
2596; CHECK-INTERLEAVED:       scalar.ph:
2597; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
2598; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
2599; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
2600; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
2601; CHECK-INTERLEAVED:       for.body:
2602; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
2603; CHECK-INTERLEAVED-NEXT:    [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
2604; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
2605; CHECK-INTERLEAVED-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
2606; CHECK-INTERLEAVED-NEXT:    [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
2607; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
2608; CHECK-INTERLEAVED-NEXT:    [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
2609; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
2610; CHECK-INTERLEAVED-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
2611; CHECK-INTERLEAVED-NEXT:    [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
2612; CHECK-INTERLEAVED-NEXT:    [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
2613; CHECK-INTERLEAVED-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
2614; CHECK-INTERLEAVED-NEXT:    [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
2615; CHECK-INTERLEAVED-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
2616; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
2617; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
2618; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
2619; CHECK-INTERLEAVED:       for.exit:
2620; CHECK-INTERLEAVED-NEXT:    [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
2621; CHECK-INTERLEAVED-NEXT:    [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
2622; CHECK-INTERLEAVED-NEXT:    br label [[EXIT]]
2623; CHECK-INTERLEAVED:       exit:
2624; CHECK-INTERLEAVED-NEXT:    [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
2625; CHECK-INTERLEAVED-NEXT:    store float [[RESULT]], ptr [[MATRIX]], align 4
2626; CHECK-INTERLEAVED-NEXT:    ret void
2627;
2628; CHECK-MAXBW-LABEL: define void @not_dotp_not_phi2(
2629; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
2630; CHECK-MAXBW-NEXT:  entry:
2631; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
2632; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]]
2633; CHECK-MAXBW:       for.preheader:
2634; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i8, ptr null, align 1
2635; CHECK-MAXBW-NEXT:    [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1
2636; CHECK-MAXBW-NEXT:    [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32
2637; CHECK-MAXBW-NEXT:    [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32
2638; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
2639; CHECK-MAXBW:       for.body:
2640; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
2641; CHECK-MAXBW-NEXT:    [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
2642; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ]
2643; CHECK-MAXBW-NEXT:    [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
2644; CHECK-MAXBW-NEXT:    [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
2645; CHECK-MAXBW-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
2646; CHECK-MAXBW-NEXT:    [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
2647; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
2648; CHECK-MAXBW-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
2649; CHECK-MAXBW-NEXT:    [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
2650; CHECK-MAXBW-NEXT:    [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
2651; CHECK-MAXBW-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
2652; CHECK-MAXBW-NEXT:    [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
2653; CHECK-MAXBW-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
2654; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
2655; CHECK-MAXBW-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
2656; CHECK-MAXBW-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
2657; CHECK-MAXBW:       for.exit:
2658; CHECK-MAXBW-NEXT:    [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ]
2659; CHECK-MAXBW-NEXT:    [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
2660; CHECK-MAXBW-NEXT:    br label [[EXIT]]
2661; CHECK-MAXBW:       exit:
2662; CHECK-MAXBW-NEXT:    [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
2663; CHECK-MAXBW-NEXT:    store float [[RESULT]], ptr [[MATRIX]], align 4
2664; CHECK-MAXBW-NEXT:    ret void
2665;
2666entry:
2667  %cmp = icmp sgt i32 %n, 0
2668  br i1 %cmp, label %for.preheader, label %exit
2669
2670for.preheader:                   ; preds = %entry
  ; Both "a" operands are read once, from the constant addresses 0 and 1,
  ; before the loop starts, so their sign-extended values do not change
  ; inside the loop.
2671  %load.a = load i8, ptr inttoptr (i64 0 to ptr), align 1
2672  %load.a1 = load i8, ptr inttoptr (i64 1 to ptr), align 1
2673  %a.ext = sext i8 %load.a to i32
2674  %a.ext1 = sext i8 %load.a1 to i32
2675  br label %for.body
2676
2677for.body:                             ; preds = %for.preheader, %for.body
2678  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ]
2679  %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ]
2680  %accum = phi i32 [ %add.1, %for.body ], [ 0, %for.preheader ]
  ; Only the bytes at offsets 1 and 2 from %ptr are read; %ptr itself advances
  ; by 16 bytes per iteration (see %scevgep below).
2681  %gep.b = getelementptr i8, ptr %ptr, i64 1
2682  %gep.b1 = getelementptr i8, ptr %ptr, i64 2
2683  %load.b = load i8, ptr %gep.b, align 1
2684  %b.ext = sext i8 %load.b to i32
2685  %mul = mul nsw i32 %a.ext, %b.ext
2686  %add = add i32 %mul, %accum
  ; The second multiply-add chains on %add rather than on the %accum phi;
  ; %add.1 is the value that flows back into %accum.
2687  %load.b1 = load i8, ptr %gep.b1, align 1
2688  %b.ext1 = sext i8 %load.b1 to i32
2689  %mul.1 = mul nsw i32 %a.ext1, %b.ext1
2690  %add.1 = add i32 %mul.1, %add
2691  %scevgep = getelementptr i8, ptr %ptr, i64 16
2692  %iv.next = add nuw nsw i32 %iv, 1
2693  %exitcond.not = icmp eq i32 %iv.next, %n
2694  br i1 %exitcond.not, label %for.exit, label %for.body
2695
2696for.exit:                       ; preds = %for.body
  ; The integer sum is converted to float here and stored through %matrix in
  ; the exit block; 0.0 is stored instead when the loop is skipped (%n <= 0).
2697  %add.1.lcssa = phi i32 [ %add.1, %for.body ]
2698  %add.float = sitofp i32 %add.1.lcssa to float
2699  br label %exit
2700
2701exit:                                ; preds = %for.exit, %entry
2702  %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ]
2703  store float %result, ptr %matrix, align 4
2704  ret void
2705}
2706
2707define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
2708; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan(
2709; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
2710; CHECK-INTERLEAVE1-NEXT:  entry:
2711; CHECK-INTERLEAVE1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
2712; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
2713; CHECK-INTERLEAVE1:       for.ph:
2714; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
2715; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
2716; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2717; CHECK-INTERLEAVE1:       vector.ph:
2718; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
2719; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2720; CHECK-INTERLEAVE1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
2721; CHECK-INTERLEAVE1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
2722; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
2723; CHECK-INTERLEAVE1:       vector.body:
2724; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2725; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
2726; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2727; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]]
2728; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
2729; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
2730; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
2731; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
2732; CHECK-INTERLEAVE1-NEXT:    [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
2733; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
2734; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2735; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
2736; CHECK-INTERLEAVE1:       middle.block:
2737; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
2738; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
2739; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2740; CHECK-INTERLEAVE1:       scalar.ph:
2741; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2742; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2743; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
2744; CHECK-INTERLEAVE1:       for.body:
2745; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2746; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2747; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
2748; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
2749; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
2750; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
2751; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i64 [[MUL]], [[ACCUM]]
2752; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
2753; CHECK-INTERLEAVE1-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
2754; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
2755; CHECK-INTERLEAVE1:       exit.loopexit:
2756; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
2757; CHECK-INTERLEAVE1-NEXT:    br label [[EXIT]]
2758; CHECK-INTERLEAVE1:       exit:
2759; CHECK-INTERLEAVE1-NEXT:    [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
2760; CHECK-INTERLEAVE1-NEXT:    ret i64 [[RESULT]]
2761;
2762; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan(
2763; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
2764; CHECK-INTERLEAVED-NEXT:  entry:
2765; CHECK-INTERLEAVED-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
2766; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
2767; CHECK-INTERLEAVED:       for.ph:
2768; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
2769; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
2770; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2771; CHECK-INTERLEAVED:       vector.ph:
2772; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
2773; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2774; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
2775; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
2776; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
2777; CHECK-INTERLEAVED:       vector.body:
2778; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2779; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
2780; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
2781; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2782; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]]
2783; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
2784; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8
2785; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
2786; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
2787; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
2788; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
2789; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[BROADCAST_SPLAT]]
2790; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
2791; CHECK-INTERLEAVED-NEXT:    [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]]
2792; CHECK-INTERLEAVED-NEXT:    [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
2793; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
2794; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2795; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
2796; CHECK-INTERLEAVED:       middle.block:
2797; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
2798; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
2799; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
2800; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2801; CHECK-INTERLEAVED:       scalar.ph:
2802; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2803; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2804; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
2805; CHECK-INTERLEAVED:       for.body:
2806; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2807; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2808; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
2809; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
2810; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
2811; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
2812; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i64 [[MUL]], [[ACCUM]]
2813; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
2814; CHECK-INTERLEAVED-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
2815; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
2816; CHECK-INTERLEAVED:       exit.loopexit:
2817; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
2818; CHECK-INTERLEAVED-NEXT:    br label [[EXIT]]
2819; CHECK-INTERLEAVED:       exit:
2820; CHECK-INTERLEAVED-NEXT:    [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
2821; CHECK-INTERLEAVED-NEXT:    ret i64 [[RESULT]]
2822;
2823; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan(
2824; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
2825; CHECK-MAXBW-NEXT:  entry:
2826; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
2827; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
2828; CHECK-MAXBW:       for.ph:
2829; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
2830; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
2831; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
2832; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
2833; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2834; CHECK-MAXBW:       vector.ph:
2835; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
2836; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
2837; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
2838; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2839; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
2840; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
2841; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0
2842; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
2843; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
2844; CHECK-MAXBW:       vector.body:
2845; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2846; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
2847; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
2848; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]]
2849; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
2850; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
2851; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
2852; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]]
2853; CHECK-MAXBW-NEXT:    [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
2854; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
2855; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2856; CHECK-MAXBW-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
2857; CHECK-MAXBW:       middle.block:
2858; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
2859; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
2860; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2861; CHECK-MAXBW:       scalar.ph:
2862; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2863; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2864; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
2865; CHECK-MAXBW:       for.body:
2866; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2867; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2868; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
2869; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
2870; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
2871; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
2872; CHECK-MAXBW-NEXT:    [[ADD]] = add i64 [[MUL]], [[ACCUM]]
2873; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
2874; CHECK-MAXBW-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
2875; CHECK-MAXBW-NEXT:    br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
2876; CHECK-MAXBW:       exit.loopexit:
2877; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
2878; CHECK-MAXBW-NEXT:    br label [[EXIT]]
2879; CHECK-MAXBW:       exit:
2880; CHECK-MAXBW-NEXT:    [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
2881; CHECK-MAXBW-NEXT:    ret i64 [[RESULT]]
2882;
2883entry:
2884  %cmp = icmp eq i64 %n, 0
2885  br i1 %cmp, label %exit, label %for.ph
2886
2887for.ph:                                   ; preds = %entry
2888  %ext.b = zext i16 %b to i64
2889  br label %for.body
2890
2891for.body:                                         ; preds = %for.body.lr.ph, %for.body
2892  %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ]
2893  %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ]
2894  %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 %iv
2895  %load.a = load i16, ptr %gep.a, align 2
2896  %ext.a = zext i16 %load.a to i64
2897  %mul = mul nuw nsw i64 %ext.a, %ext.b
2898  %add = add i64 %mul, %accum
2899  %iv.next = add nuw nsw i64 %iv, 1
2900  %cmp.1 = icmp eq i64 %iv.next, %n
2901  br i1 %cmp.1, label %exit, label %for.body
2902
2903exit:                                 ; preds = %for.cond.cleanup.loopexit, %entry
2904  %result = phi i64 [ 0, %entry ], [ %add, %for.body ]
2905  ret i64 %result
2906}
2907
; Variant of @not_dotp_ext_outside_plan with the multiply operands swapped:
; the extend of %b (%ext.b, computed once in for.ph before the loop) is the
; first operand of the mul and the in-loop extend of the loaded value
; (%ext.a) is the second.
; The autogenerated assertions for all three RUN configurations expect an
; ordinary wide add into the accumulator phi followed by
; llvm.vector.reduce.add in middle.block; no partial-reduction intrinsic
; call appears in the expected output.
2908define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
2909; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan2(
2910; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
2911; CHECK-INTERLEAVE1-NEXT:  entry:
2912; CHECK-INTERLEAVE1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
2913; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
2914; CHECK-INTERLEAVE1:       for.ph:
2915; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
2916; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
2917; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2918; CHECK-INTERLEAVE1:       vector.ph:
2919; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
2920; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2921; CHECK-INTERLEAVE1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
2922; CHECK-INTERLEAVE1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
2923; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
2924; CHECK-INTERLEAVE1:       vector.body:
2925; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2926; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
2927; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2928; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]]
2929; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
2930; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
2931; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
2932; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
2933; CHECK-INTERLEAVE1-NEXT:    [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
2934; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
2935; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2936; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
2937; CHECK-INTERLEAVE1:       middle.block:
2938; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
2939; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
2940; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2941; CHECK-INTERLEAVE1:       scalar.ph:
2942; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2943; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
2944; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
2945; CHECK-INTERLEAVE1:       for.body:
2946; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2947; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
2948; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
2949; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
2950; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
2951; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
2952; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add i64 [[MUL]], [[ACCUM]]
2953; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
2954; CHECK-INTERLEAVE1-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
2955; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
2956; CHECK-INTERLEAVE1:       exit.loopexit:
2957; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
2958; CHECK-INTERLEAVE1-NEXT:    br label [[EXIT]]
2959; CHECK-INTERLEAVE1:       exit:
2960; CHECK-INTERLEAVE1-NEXT:    [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
2961; CHECK-INTERLEAVE1-NEXT:    ret i64 [[RESULT]]
2962;
2963; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan2(
2964; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
2965; CHECK-INTERLEAVED-NEXT:  entry:
2966; CHECK-INTERLEAVED-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
2967; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
2968; CHECK-INTERLEAVED:       for.ph:
2969; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
2970; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
2971; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2972; CHECK-INTERLEAVED:       vector.ph:
2973; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
2974; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2975; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0
2976; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
2977; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
2978; CHECK-INTERLEAVED:       vector.body:
2979; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2980; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
2981; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
2982; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2983; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]]
2984; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
2985; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8
2986; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
2987; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
2988; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
2989; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
2990; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
2991; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
2992; CHECK-INTERLEAVED-NEXT:    [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]]
2993; CHECK-INTERLEAVED-NEXT:    [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
2994; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
2995; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2996; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
2997; CHECK-INTERLEAVED:       middle.block:
2998; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
2999; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
3000; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
3001; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
3002; CHECK-INTERLEAVED:       scalar.ph:
3003; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
3004; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
3005; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
3006; CHECK-INTERLEAVED:       for.body:
3007; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
3008; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
3009; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
3010; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
3011; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
3012; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
3013; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add i64 [[MUL]], [[ACCUM]]
3014; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
3015; CHECK-INTERLEAVED-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
3016; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
3017; CHECK-INTERLEAVED:       exit.loopexit:
3018; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
3019; CHECK-INTERLEAVED-NEXT:    br label [[EXIT]]
3020; CHECK-INTERLEAVED:       exit:
3021; CHECK-INTERLEAVED-NEXT:    [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
3022; CHECK-INTERLEAVED-NEXT:    ret i64 [[RESULT]]
3023;
3024; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan2(
3025; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
3026; CHECK-MAXBW-NEXT:  entry:
3027; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
3028; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
3029; CHECK-MAXBW:       for.ph:
3030; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
3031; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
3032; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
3033; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
3034; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
3035; CHECK-MAXBW:       vector.ph:
3036; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
3037; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
3038; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
3039; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
3040; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
3041; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
3042; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0
3043; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
3044; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
3045; CHECK-MAXBW:       vector.body:
3046; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
3047; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
3048; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
3049; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]]
3050; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
3051; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
3052; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
3053; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
3054; CHECK-MAXBW-NEXT:    [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
3055; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
3056; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
3057; CHECK-MAXBW-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
3058; CHECK-MAXBW:       middle.block:
3059; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
3060; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
3061; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
3062; CHECK-MAXBW:       scalar.ph:
3063; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
3064; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
3065; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
3066; CHECK-MAXBW:       for.body:
3067; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
3068; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
3069; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
3070; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
3071; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
3072; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
3073; CHECK-MAXBW-NEXT:    [[ADD]] = add i64 [[MUL]], [[ACCUM]]
3074; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
3075; CHECK-MAXBW-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
3076; CHECK-MAXBW-NEXT:    br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
3077; CHECK-MAXBW:       exit.loopexit:
3078; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
3079; CHECK-MAXBW-NEXT:    br label [[EXIT]]
3080; CHECK-MAXBW:       exit:
3081; CHECK-MAXBW-NEXT:    [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
3082; CHECK-MAXBW-NEXT:    ret i64 [[RESULT]]
3083;
3084entry:
3085  %cmp = icmp eq i64 %n, 0
3086  br i1 %cmp, label %exit, label %for.ph
3087
3088for.ph:                                   ; preds = %entry
  ; %b is zero-extended once here, before the loop is entered.
3089  %ext.b = zext i16 %b to i64
3090  br label %for.body
3091
3092for.body:                                         ; preds = %for.ph, %for.body
3093  %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ]
3094  %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ]
3095  %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 %iv
3096  %load.a = load i16, ptr %gep.a, align 2
3097  %ext.a = zext i16 %load.a to i64
  ; Pre-loop extend first, in-loop extend second: the reverse of the operand
  ; order used by @not_dotp_ext_outside_plan.
3098  %mul = mul nuw nsw i64 %ext.b, %ext.a
3099  %add = add i64 %mul, %accum
3100  %iv.next = add nuw nsw i64 %iv, 1
3101  %cmp.1 = icmp eq i64 %iv.next, %n
3102  br i1 %cmp.1, label %exit, label %for.body
3103
3104exit:                                 ; preds = %for.body, %entry
3105  %result = phi i64 [ 0, %entry ], [ %add, %for.body ]
3106  ret i64 %result
3107}
3108
; Loop metadata: !7 is a distinct, self-referential node whose remaining
; operands are the three loop properties defined by !8, !9 and !10 below.
; NOTE(review): the `!llvm.loop !7` use site is not in this part of the file;
; presumably it is attached to a loop in an earlier function. Confirm before
; renumbering or removing these nodes.
3109!7 = distinct !{!7, !8, !9, !10}
3110!8 = !{!"llvm.loop.mustprogress"}
3111!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
3112!10 = !{!"llvm.loop.vectorize.enable", i1 true}
; Attribute group #0, referenced from the `define` lines of the test
; functions (e.g. @not_dotp_ext_outside_plan2): it sets vscale_range(1,16)
; and the "+sve" target feature.
3113attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
3114;.
3115; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
3116; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
3117; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
3118; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
3119; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
3120; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
3121; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
3122; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
3123; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
3124; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
3125; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
3126; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
3127; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
3128; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
3129; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
3130; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"}
3131; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
3132; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
3133; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
3134; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
3135; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
3136; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
3137; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
3138; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
3139; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
3140;.
3141; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
3142; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
3143; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
3144; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
3145; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
3146; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
3147; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
3148; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
3149; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
3150; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
3151; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
3152; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
3153; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
3154; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
3155; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
3156; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"}
3157; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
3158; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
3159; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
3160; CHECK-INTERLEAVED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
3161; CHECK-INTERLEAVED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
3162; CHECK-INTERLEAVED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
3163; CHECK-INTERLEAVED: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]}
3164; CHECK-INTERLEAVED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
3165; CHECK-INTERLEAVED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
3166; CHECK-INTERLEAVED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]}
3167; CHECK-INTERLEAVED: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]}
3168;.
3169; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
3170; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
3171; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
3172; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
3173; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
3174; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
3175; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
3176; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
3177; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
3178; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
3179; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
3180; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
3181; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
3182; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
3183; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]}
3184; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"}
3185; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]}
3186; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
3187; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]}
3188; CHECK-MAXBW: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
3189; CHECK-MAXBW: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]}
3190; CHECK-MAXBW: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
3191; CHECK-MAXBW: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]}
3192; CHECK-MAXBW: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]}
3193; CHECK-MAXBW: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]}
3194;.
3195