; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=GENERIC
; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=KRYO

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; These tests check that we vectorize the index calculations in the
; gather-reduce pattern shown below. We check cases having i32 and i64
; subtraction.
;
; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
;   int sum = 0;
;   for (int i = 0; i < n ; ++i) {
;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
;   }
;   return sum;
; }
;
; NOTE(review): the IR below zero-extends every loaded i16 (zext), which
; corresponds to unsigned short elements rather than the signed short shown
; in the C sketch above; confirm which one the sketch is meant to convey.

; i32 variant: each i16 loaded from %a and %b is zero-extended to i32, the
; pair is subtracted in i32, and the i32 difference is used directly as the
; index into %g. Under both the GENERIC and KRYO prefixes the expected output
; replaces the eight scalar load/zext/sub chains with one <8 x i16> load per
; input, a <8 x i32> zext and a single <8 x i32> sub; each lane is then
; extracted and sign-extended to i64 to form the gather address.
define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %g, i32 %n) {
; GENERIC-LABEL: @gather_reduce_8x16_i32(
; GENERIC-NEXT: entry:
; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; GENERIC: for.body.preheader:
; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
; GENERIC: for.cond.cleanup.loopexit:
; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
; GENERIC: for.cond.cleanup:
; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
; GENERIC: for.body:
; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
; GENERIC-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
; GENERIC-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
; GENERIC-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
; GENERIC-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; GENERIC-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
; GENERIC-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
; GENERIC-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
; GENERIC-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; GENERIC-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
; GENERIC-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
; GENERIC-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; GENERIC-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
; GENERIC-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
; GENERIC-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; GENERIC-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
; GENERIC-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
; GENERIC-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; GENERIC-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
; GENERIC-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
; GENERIC-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; GENERIC-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
; GENERIC-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
; GENERIC-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; GENERIC-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
; GENERIC-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
; GENERIC-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; GENERIC-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
; GENERIC-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
; GENERIC-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
; KRYO-LABEL: @gather_reduce_8x16_i32(
; KRYO-NEXT: entry:
; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; KRYO: for.body.preheader:
; KRYO-NEXT: br label [[FOR_BODY:%.*]]
; KRYO: for.cond.cleanup.loopexit:
; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
; KRYO: for.cond.cleanup:
; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
; KRYO: for.body:
; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
; KRYO-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
; KRYO-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
; KRYO-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
; KRYO-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; KRYO-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
; KRYO-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
; KRYO-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
; KRYO-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; KRYO-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
; KRYO-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
; KRYO-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; KRYO-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
; KRYO-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
; KRYO-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; KRYO-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
; KRYO-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
; KRYO-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; KRYO-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
; KRYO-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
; KRYO-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; KRYO-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
; KRYO-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
; KRYO-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; KRYO-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
; KRYO-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
; KRYO-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; KRYO-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
; KRYO-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
; KRYO-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
  %cmp.99 = icmp sgt i32 %n, 0
  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
  %a.addr.0101 = phi ptr [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i16, ptr %a.addr.0101, i64 1
  %0 = load i16, ptr %a.addr.0101, align 2
  %conv = zext i16 %0 to i32
  %incdec.ptr1 = getelementptr inbounds i16, ptr %b, i64 1
  %1 = load i16, ptr %b, align 2
  %conv2 = zext i16 %1 to i32
  %sub = sub nsw i32 %conv, %conv2
  %arrayidx = getelementptr inbounds i16, ptr %g, i32 %sub
  %2 = load i16, ptr %arrayidx, align 2
  %conv3 = zext i16 %2 to i32
  %add = add nsw i32 %conv3, %sum.0102
  %incdec.ptr4 = getelementptr inbounds i16, ptr %a.addr.0101, i64 2
  %3 = load i16, ptr %incdec.ptr, align 2
  %conv5 = zext i16 %3 to i32
  %incdec.ptr6 = getelementptr inbounds i16, ptr %b, i64 2
  %4 = load i16, ptr %incdec.ptr1, align 2
  %conv7 = zext i16 %4 to i32
  %sub8 = sub nsw i32 %conv5, %conv7
  %arrayidx10 = getelementptr inbounds i16, ptr %g, i32 %sub8
  %5 = load i16, ptr %arrayidx10, align 2
  %conv11 = zext i16 %5 to i32
  %add12 = add nsw i32 %add, %conv11
  %incdec.ptr13 = getelementptr inbounds i16, ptr %a.addr.0101, i64 3
  %6 = load i16, ptr %incdec.ptr4, align 2
  %conv14 = zext i16 %6 to i32
  %incdec.ptr15 = getelementptr inbounds i16, ptr %b, i64 3
  %7 = load i16, ptr %incdec.ptr6, align 2
  %conv16 = zext i16 %7 to i32
  %sub17 = sub nsw i32 %conv14, %conv16
  %arrayidx19 = getelementptr inbounds i16, ptr %g, i32 %sub17
  %8 = load i16, ptr %arrayidx19, align 2
  %conv20 = zext i16 %8 to i32
  %add21 = add nsw i32 %add12, %conv20
  %incdec.ptr22 = getelementptr inbounds i16, ptr %a.addr.0101, i64 4
  %9 = load i16, ptr %incdec.ptr13, align 2
  %conv23 = zext i16 %9 to i32
  %incdec.ptr24 = getelementptr inbounds i16, ptr %b, i64 4
  %10 = load i16, ptr %incdec.ptr15, align 2
  %conv25 = zext i16 %10 to i32
  %sub26 = sub nsw i32 %conv23, %conv25
  %arrayidx28 = getelementptr inbounds i16, ptr %g, i32 %sub26
  %11 = load i16, ptr %arrayidx28, align 2
  %conv29 = zext i16 %11 to i32
  %add30 = add nsw i32 %add21, %conv29
  %incdec.ptr31 = getelementptr inbounds i16, ptr %a.addr.0101, i64 5
  %12 = load i16, ptr %incdec.ptr22, align 2
  %conv32 = zext i16 %12 to i32
  %incdec.ptr33 = getelementptr inbounds i16, ptr %b, i64 5
  %13 = load i16, ptr %incdec.ptr24, align 2
  %conv34 = zext i16 %13 to i32
  %sub35 = sub nsw i32 %conv32, %conv34
  %arrayidx37 = getelementptr inbounds i16, ptr %g, i32 %sub35
  %14 = load i16, ptr %arrayidx37, align 2
  %conv38 = zext i16 %14 to i32
  %add39 = add nsw i32 %add30, %conv38
  %incdec.ptr40 = getelementptr inbounds i16, ptr %a.addr.0101, i64 6
  %15 = load i16, ptr %incdec.ptr31, align 2
  %conv41 = zext i16 %15 to i32
  %incdec.ptr42 = getelementptr inbounds i16, ptr %b, i64 6
  %16 = load i16, ptr %incdec.ptr33, align 2
  %conv43 = zext i16 %16 to i32
  %sub44 = sub nsw i32 %conv41, %conv43
  %arrayidx46 = getelementptr inbounds i16, ptr %g, i32 %sub44
  %17 = load i16, ptr %arrayidx46, align 2
  %conv47 = zext i16 %17 to i32
  %add48 = add nsw i32 %add39, %conv47
  %incdec.ptr49 = getelementptr inbounds i16, ptr %a.addr.0101, i64 7
  %18 = load i16, ptr %incdec.ptr40, align 2
  %conv50 = zext i16 %18 to i32
  %incdec.ptr51 = getelementptr inbounds i16, ptr %b, i64 7
  %19 = load i16, ptr %incdec.ptr42, align 2
  %conv52 = zext i16 %19 to i32
  %sub53 = sub nsw i32 %conv50, %conv52
  %arrayidx55 = getelementptr inbounds i16, ptr %g, i32 %sub53
  %20 = load i16, ptr %arrayidx55, align 2
  %conv56 = zext i16 %20 to i32
  %add57 = add nsw i32 %add48, %conv56
  %incdec.ptr58 = getelementptr inbounds i16, ptr %a.addr.0101, i64 8
  %21 = load i16, ptr %incdec.ptr49, align 2
  %conv59 = zext i16 %21 to i32
  %22 = load i16, ptr %incdec.ptr51, align 2
  %conv61 = zext i16 %22 to i32
  %sub62 = sub nsw i32 %conv59, %conv61
  %arrayidx64 = getelementptr inbounds i16, ptr %g, i32 %sub62
  %23 = load i16, ptr %arrayidx64, align 2
  %conv65 = zext i16 %23 to i32
  %add66 = add nsw i32 %add57, %conv65
  %inc = add nuw nsw i32 %i.0103, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; i64 variant: same loop, but the i16 values loaded from %a and %b are
; zero-extended to i64 and subtracted in i64. The expected output under both
; prefixes still performs the vector subtraction in <8 x i32> and sign-extends
; each extracted lane to i64, i.e. the checked instruction sequence is the
; same as for the i32 variant.
define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %g, i32 %n) {
; GENERIC-LABEL: @gather_reduce_8x16_i64(
; GENERIC-NEXT: entry:
; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; GENERIC: for.body.preheader:
; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
; GENERIC: for.cond.cleanup.loopexit:
; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
; GENERIC: for.cond.cleanup:
; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
; GENERIC: for.body:
; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
; GENERIC-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
; GENERIC-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
; GENERIC-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
; GENERIC-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; GENERIC-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
; GENERIC-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
; GENERIC-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
; GENERIC-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; GENERIC-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
; GENERIC-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
; GENERIC-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; GENERIC-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
; GENERIC-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
; GENERIC-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; GENERIC-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
; GENERIC-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
; GENERIC-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; GENERIC-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
; GENERIC-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
; GENERIC-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; GENERIC-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
; GENERIC-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
; GENERIC-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; GENERIC-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
; GENERIC-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
; GENERIC-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; GENERIC-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
; GENERIC-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
; GENERIC-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
; KRYO-LABEL: @gather_reduce_8x16_i64(
; KRYO-NEXT: entry:
; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; KRYO: for.body.preheader:
; KRYO-NEXT: br label [[FOR_BODY:%.*]]
; KRYO: for.cond.cleanup.loopexit:
; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
; KRYO: for.cond.cleanup:
; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
; KRYO: for.body:
; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
; KRYO-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
; KRYO-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
; KRYO-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
; KRYO-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
; KRYO-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
; KRYO-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
; KRYO-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
; KRYO-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; KRYO-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
; KRYO-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
; KRYO-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; KRYO-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
; KRYO-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
; KRYO-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; KRYO-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
; KRYO-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
; KRYO-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; KRYO-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
; KRYO-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
; KRYO-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; KRYO-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
; KRYO-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
; KRYO-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; KRYO-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
; KRYO-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
; KRYO-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; KRYO-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
; KRYO-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
; KRYO-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
  %cmp.99 = icmp sgt i32 %n, 0
  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
  %a.addr.0101 = phi ptr [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i16, ptr %a.addr.0101, i64 1
  %0 = load i16, ptr %a.addr.0101, align 2
  %conv = zext i16 %0 to i64
  %incdec.ptr1 = getelementptr inbounds i16, ptr %b, i64 1
  %1 = load i16, ptr %b, align 2
  %conv2 = zext i16 %1 to i64
  %sub = sub nsw i64 %conv, %conv2
  %arrayidx = getelementptr inbounds i16, ptr %g, i64 %sub
  %2 = load i16, ptr %arrayidx, align 2
  %conv3 = zext i16 %2 to i32
  %add = add nsw i32 %conv3, %sum.0102
  %incdec.ptr4 = getelementptr inbounds i16, ptr %a.addr.0101, i64 2
  %3 = load i16, ptr %incdec.ptr, align 2
  %conv5 = zext i16 %3 to i64
  %incdec.ptr6 = getelementptr inbounds i16, ptr %b, i64 2
  %4 = load i16, ptr %incdec.ptr1, align 2
  %conv7 = zext i16 %4 to i64
  %sub8 = sub nsw i64 %conv5, %conv7
  %arrayidx10 = getelementptr inbounds i16, ptr %g, i64 %sub8
  %5 = load i16, ptr %arrayidx10, align 2
  %conv11 = zext i16 %5 to i32
  %add12 = add nsw i32 %add, %conv11
  %incdec.ptr13 = getelementptr inbounds i16, ptr %a.addr.0101, i64 3
  %6 = load i16, ptr %incdec.ptr4, align 2
  %conv14 = zext i16 %6 to i64
  %incdec.ptr15 = getelementptr inbounds i16, ptr %b, i64 3
  %7 = load i16, ptr %incdec.ptr6, align 2
  %conv16 = zext i16 %7 to i64
  %sub17 = sub nsw i64 %conv14, %conv16
  %arrayidx19 = getelementptr inbounds i16, ptr %g, i64 %sub17
  %8 = load i16, ptr %arrayidx19, align 2
  %conv20 = zext i16 %8 to i32
  %add21 = add nsw i32 %add12, %conv20
  %incdec.ptr22 = getelementptr inbounds i16, ptr %a.addr.0101, i64 4
  %9 = load i16, ptr %incdec.ptr13, align 2
  %conv23 = zext i16 %9 to i64
  %incdec.ptr24 = getelementptr inbounds i16, ptr %b, i64 4
  %10 = load i16, ptr %incdec.ptr15, align 2
  %conv25 = zext i16 %10 to i64
  %sub26 = sub nsw i64 %conv23, %conv25
  %arrayidx28 = getelementptr inbounds i16, ptr %g, i64 %sub26
  %11 = load i16, ptr %arrayidx28, align 2
  %conv29 = zext i16 %11 to i32
  %add30 = add nsw i32 %add21, %conv29
  %incdec.ptr31 = getelementptr inbounds i16, ptr %a.addr.0101, i64 5
  %12 = load i16, ptr %incdec.ptr22, align 2
  %conv32 = zext i16 %12 to i64
  %incdec.ptr33 = getelementptr inbounds i16, ptr %b, i64 5
  %13 = load i16, ptr %incdec.ptr24, align 2
  %conv34 = zext i16 %13 to i64
  %sub35 = sub nsw i64 %conv32, %conv34
  %arrayidx37 = getelementptr inbounds i16, ptr %g, i64 %sub35
  %14 = load i16, ptr %arrayidx37, align 2
  %conv38 = zext i16 %14 to i32
  %add39 = add nsw i32 %add30, %conv38
  %incdec.ptr40 = getelementptr inbounds i16, ptr %a.addr.0101, i64 6
  %15 = load i16, ptr %incdec.ptr31, align 2
  %conv41 = zext i16 %15 to i64
  %incdec.ptr42 = getelementptr inbounds i16, ptr %b, i64 6
  %16 = load i16, ptr %incdec.ptr33, align 2
  %conv43 = zext i16 %16 to i64
  %sub44 = sub nsw i64 %conv41, %conv43
  %arrayidx46 = getelementptr inbounds i16, ptr %g, i64 %sub44
  %17 = load i16, ptr %arrayidx46, align 2
  %conv47 = zext i16 %17 to i32
  %add48 = add nsw i32 %add39, %conv47
  %incdec.ptr49 = getelementptr inbounds i16, ptr %a.addr.0101, i64 7
  %18 = load i16, ptr %incdec.ptr40, align 2
  %conv50 = zext i16 %18 to i64
  %incdec.ptr51 = getelementptr inbounds i16, ptr %b, i64 7
  %19 = load i16, ptr %incdec.ptr42, align 2
  %conv52 = zext i16 %19 to i64
  %sub53 = sub nsw i64 %conv50, %conv52
  %arrayidx55 = getelementptr inbounds i16, ptr %g, i64 %sub53
  %20 = load i16, ptr %arrayidx55, align 2
  %conv56 = zext i16 %20 to i32
  %add57 = add nsw i32 %add48, %conv56
  %incdec.ptr58 = getelementptr inbounds i16, ptr %a.addr.0101, i64 8
  %21 = load i16, ptr %incdec.ptr49, align 2
  %conv59 = zext i16 %21 to i64
  %22 = load i16, ptr %incdec.ptr51, align 2
  %conv61 = zext i16 %22 to i64
  %sub62 = sub nsw i64 %conv59, %conv61
  %arrayidx64 = getelementptr inbounds i16, ptr %g, i64 %sub62
  %23 = load i16, ptr %arrayidx64, align 2
  %conv65 = zext i16 %23 to i32
  %add66 = add nsw i32 %add57, %conv65
  %inc = add nuw nsw i32 %i.0103, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}