; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize,simplifycfg -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
; RUN: opt < %s -passes=loop-vectorize,simplifycfg -mcpu=knl -force-vector-width=2 -force-target-max-vector-interleave=1 -S | FileCheck %s -check-prefix=FVW2

; With a force-vector-width, it is sometimes more profitable to generate
; scalarized and predicated stores instead of masked scatter. Disable
; interleaving to simplify CHECKs in that scenario.

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
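
; (In the C snippets below, SIZE is assumed to be 4096; that matches the
; trip counts of the vectorized loops in the IR that follows.)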

; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
; for (int i=0; i < SIZE; ++i) {
;   if (trigger[i] > 0) {
;     out[i] = in[index[i]] + (float) 0.5;
;   }
; }
;}

; Function Attrs: nounwind uwtable
define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) {
; AVX512-LABEL: @foo1(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
; AVX512-NEXT: [[TMP3:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison)
; AVX512-NEXT: [[TMP6:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <16 x i64> [[TMP6]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP7]], i32 4, <16 x i1> [[TMP3]], <16 x float> poison)
; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr [[TMP10]], i32 4, <16 x i1> [[TMP3]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo1(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
; FVW2-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
; FVW2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP5]], i32 4, <2 x i1> [[TMP3]], <2 x i32> poison)
; FVW2-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <2 x i64> [[TMP6]]
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP7]], i32 4, <2 x i1> [[TMP3]], <2 x float> poison)
; FVW2-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr [[TMP10]], i32 4, <2 x i1> [[TMP3]])
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; FVW2-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
  %1 = load i32, ptr %arrayidx3, align 4
  %idxprom4 = sext i32 %1 to i64
  %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %idxprom4
  %2 = load float, ptr %arrayidx5, align 4
  %add = fadd float %2, 5.000000e-01
  %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
  store float %add, ptr %arrayidx7, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; The source code
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
; for (int i=0; i<SIZE; i += 16) {
;   if (trigger[i] > 0) {
;     out[i] = in[i].b + (float) 0.5;
;   }
; }
;}
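
; The In struct is never spelled out in the comments; a definition consistent
; with the %struct.In IR type below would be (assumption):
;
;struct In {
;  float a;
;  float b;
;};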

%struct.In = type { float, float }

define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
; AVX512-LABEL: @foo2(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], ptr [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
  %1 = load float, ptr %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
  store float %add, ptr %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; The source code
;struct Out {
;  float a;
;  float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
; for (int i=0; i<SIZE; i += 16) {
;   if (trigger[i] > 0) {
;     out[i].b = in[i].b + (float) 0.5;
;   }
; }
;}

%struct.Out = type { float, float }

define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) {
; AVX512-LABEL: @foo3(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo3(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], i64 [[TMP0]], i32 1
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; FVW2: pred.store.if1:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_OUT]], ptr [[OUT]], i64 [[TMP1]], i32 1
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], ptr [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE2]]
; FVW2: pred.store.continue2:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
  %1 = load float, ptr %b, align 4
  %add = fadd float %1, 5.000000e-01
  %b6 = getelementptr inbounds %struct.Out, ptr %out, i64 %indvars.iv, i32 1
  store float %add, ptr %b6, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}
declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)

; The same as @foo2 but the scatter/gather argument is a vector of ptrs with address space 1.
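
; A C-level sketch of this variant (an assumption; the original gives no
; source for it), with both in and out qualified with Clang's address_space
; attribute:
;
;void foo2_addrspace (In __attribute__((address_space(1))) * __restrict__ in,
;                     float __attribute__((address_space(1))) * __restrict__ out,
;                     int * __restrict__ trigger, int * __restrict__ index) {
;
; for (int i=0; i<SIZE; i += 16) {
;   if (trigger[i] > 0) {
;     out[i] = in[i].b + (float) 0.5;
;   }
; }
;}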

define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
; AVX512-LABEL: @foo2_addrspace(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], ptr addrspace(1) [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], ptr addrspace(1) [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, ptr addrspace(1) %in, i64 %indvars.iv, i32 1
  %1 = load float, ptr addrspace(1) %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %indvars.iv
  store float %add, ptr addrspace(1) %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Same as foo2_addrspace but here only the input has the non-default address space.
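
; A plausible C signature for this variant (assumption): only in carries the
; address_space(1) qualifier; out stays in the default address space.
;
;void foo2_addrspace2 (In __attribute__((address_space(1))) * __restrict__ in,
;                      float * __restrict__ out,
;                      int * __restrict__ trigger, int * __restrict__ index);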

define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noalias %out, ptr noalias %trigger, ptr noalias %index) {
; AVX512-LABEL: @foo2_addrspace2(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace2(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], ptr [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, ptr addrspace(1) %in, i64 %indvars.iv, i32 1
  %1 = load float, ptr addrspace(1) %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
  store float %add, ptr %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Same as foo2_addrspace but here only the output has the non-default address space.
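
; A plausible C signature for this variant (assumption): only out carries the
; address_space(1) qualifier; in stays in the default address space.
;
;void foo2_addrspace3 (In * __restrict__ in,
;                      float __attribute__((address_space(1))) * __restrict__ out,
;                      int * __restrict__ trigger, int * __restrict__ index);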

define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) {
; AVX512-LABEL: @foo2_addrspace3(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace3(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], ptr addrspace(1) [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], ptr addrspace(1) [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
  %1 = load float, ptr %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %indvars.iv
  store float %add, ptr addrspace(1) %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Using gathers is not profitable for this function. PR48429.
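; A C-level sketch of the tested loop (an assumption reconstructed from the IR
; below; the name test is illustrative). The loads advance one float per
; iteration while dest advances 16 floats, so the stores are wide-strided:
;
;void test(int d, const float *ptr, float *dest) {
;  if (d == 0) return;
;  const float *end = ptr + d;
;  do {
;    dest[0] = ptr[-d];
;    dest[1] = ptr[0];
;    ++ptr;
;    dest += 16;
;  } while (ptr != end);
;}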
define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %ptr, ptr nocapture noalias %dest) {
; AVX512-LABEL: @test_gather_not_profitable_pr48429(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
; AVX512-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]]
; AVX512-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]]
; AVX512: iter.check:
; AVX512-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]]
; AVX512-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; AVX512-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; AVX512-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
; AVX512-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
; AVX512-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6
; AVX512-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]]
; AVX512-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2
; AVX512-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]]
; AVX512-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]]
; AVX512-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]]
; AVX512-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]]
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]]
; AVX512-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP3]], 16
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; AVX512-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 64
; AVX512-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP13]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448, i64 512, i64 576, i64 640, i64 704, i64 768, i64 832, i64 896, i64 960>
; AVX512-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; AVX512-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]]
; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP18]], align 4, !alias.scope [[META8:![0-9]+]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP14]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope [[META15:![0-9]+]]
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP14]], i64 1
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD8]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 1024
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
; AVX512-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
; AVX512-NEXT: [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
; AVX512-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
; AVX512-NEXT: [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
; AVX512-NEXT: [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
; AVX512-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP24]]
; AVX512-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC10]], 64
; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP25]]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[POINTER_PHI19:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[POINTER_PHI19]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
; AVX512-NEXT: [[OFFSET_IDX21:%.*]] = mul i64 [[INDEX18]], 4
; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX21]], 0
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
; AVX512-NEXT: [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
; AVX512-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
; AVX512-NEXT: [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
; AVX512-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
; AVX512-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ]
; AVX512-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]]
; AVX512-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: store float [[TMP34]], ptr [[DEST_ADDR_011]], align 4
; AVX512-NEXT: [[TMP35:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1
; AVX512-NEXT: store float [[TMP35]], ptr [[ARRAYIDX5]], align 4
; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @test_gather_not_profitable_pr48429(
; FVW2-NEXT: entry:
; FVW2-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
; FVW2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]]
; FVW2-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
; FVW2-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; FVW2: for.body.lr.ph:
; FVW2-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]]
; FVW2-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; FVW2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; FVW2-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; FVW2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; FVW2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; FVW2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
; FVW2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; FVW2: vector.memcheck:
; FVW2-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; FVW2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
; FVW2-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
; FVW2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6
; FVW2-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
; FVW2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]]
; FVW2-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2
; FVW2-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4
; FVW2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]]
; FVW2-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4
; FVW2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]]
; FVW2-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]]
; FVW2-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]]
; FVW2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]]
; FVW2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]]
; FVW2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; FVW2-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]]
; FVW2-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
; FVW2-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
; FVW2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
; FVW2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; FVW2: vector.ph:
; FVW2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
; FVW2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; FVW2-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
; FVW2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP13]]
; FVW2-NEXT: [[TMP14:%.*]] = mul i64 [[N_VEC]], 64
; FVW2-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP14]]
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; FVW2-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]]
; FVW2-NEXT: [[OFFSET_IDX9:%.*]] = mul i64 [[INDEX]], 64
; FVW2-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX9]], 0
; FVW2-NEXT: [[TMP18:%.*]] = add i64 [[OFFSET_IDX9]], 64
; FVW2-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP17]]
; FVW2-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP18]]
; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope [[META8:![0-9]+]]
; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; FVW2-NEXT: store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]]
; FVW2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope [[META15:![0-9]+]]
; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1
; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1
; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0
; FVW2-NEXT: store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]]
; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1
; FVW2-NEXT: store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]]
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FVW2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; FVW2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
; FVW2: scalar.ph:
; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ]
; FVW2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ]
; FVW2-NEXT: br label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]]
; FVW2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; FVW2-NEXT: store float [[TMP31]], ptr [[DEST_ADDR_011]], align 4
; FVW2-NEXT: [[TMP32:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1
; FVW2-NEXT: store float [[TMP32]], ptr [[ARRAYIDX5]], align 4
; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  %idx.ext = sext i32 %d to i64
  %add.ptr = getelementptr inbounds float, ptr %ptr, i64 %idx.ext
  %cmp.not10 = icmp eq i32 %d, 0
  br i1 %cmp.not10, label %for.end, label %for.body.lr.ph

for.body.lr.ph:
  %mul = sub nsw i32 0, %d
  %idxprom = sext i32 %mul to i64
  br label %for.body

for.body:
  %ptr.addr.012 = phi ptr [ %ptr, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %dest.addr.011 = phi ptr [ %dest, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %ptr.addr.012, i64 %idxprom
  %0 = load float, ptr %arrayidx, align 4
  store float %0, ptr %dest.addr.011, align 4
  %1 = load float, ptr %ptr.addr.012, align 4
  %arrayidx5 = getelementptr inbounds float, ptr %dest.addr.011, i64 1
  store float %1, ptr %arrayidx5, align 4
  %incdec.ptr = getelementptr inbounds float, ptr %ptr.addr.012, i64 1
  %add.ptr6 = getelementptr inbounds float, ptr %dest.addr.011, i64 16
  %cmp.not = icmp eq ptr %incdec.ptr, %add.ptr
  br i1 %cmp.not, label %for.end, label %for.body

for.end:
  ret void
}

; Assumed definition for the #0 attribute group referenced by @foo2 and
; @foo2_addrspace above; without it the file does not parse. It matches the
; "Function Attrs: nounwind uwtable" comment on @foo1.
attributes #0 = { nounwind uwtable }