; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; The following functions should all fail to become tail-predicated.
; CHECK-NOT: call i32 @llvm.arm.vctp

; Shape shared by the loops in this file:
;   entry       computes the value handed to llvm.start.loop.iterations as
;               ((((N + 3) >> 2) << 2) - 4 >> 2) + 1 and skips the loop when
;               N == 0.
;   vector.ph   builds a <4 x i32> bound with an insertelement followed by a
;               shufflevector.
;   vector.body splats %index, adds a per-lane offset vector, compares the
;               result against the bound and uses that <4 x i1> as the mask of
;               two masked loads and one masked store.
; The comment above each function, where there is one, names how that function
; departs from this shape.

; trip.count.minus.1 has been inserted into element 1, not 0.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Departure: the scalar lands in lane 1 while the shuffle below reads lane 0.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The insert isn't using an undef for operand 0.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Departure: the vector being inserted into is a constant <1, 1, 1, 1>, not undef.
  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The shuffle uses a defined value for operand 1.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Departure: the second shuffle input is a constant <1, 1, 1, 1>, not undef.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The shuffle uses a non zero value for operand 2.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Departure: the shuffle mask selects lane 1 for every element, but the
  ; scalar was inserted into lane 0.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; %N - 2: the splatted bound is %N - 2 instead of %N - 1. The insertelement
; and shufflevector that build the splat use lane 0 and undef inputs, so the
; bound is the only departure in this function.
define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Departure: -2 rather than -1.
  %trip.count.minus.2 = add i32 %N, -2
  ; Inserted at lane 0 so that this function does not also repeat the
  ; wrong-lane departure that @wrong_ph_insert_0 is named for.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; index has been inserted at element 1, not 0.
define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  ; Departure: %index lands in lane 1 while the shuffle below reads lane 0.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The value splatted in the loop body is %index + 1, not %index itself.
define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  ; Departure: the splat is built from %incorrect (%index + 1), while the
  ; address computations below still use %index.
  %incorrect = add i32 %index, 1
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Now using ult, not ule for the vector icmp
define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  ; Departure: strict unsigned less-than against the %N - 1 splat.
  %active.mask = icmp ult <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The add in the body uses 1, 2, 3, 4
define void @wrong_body_broadcast_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Departure: per-lane offsets start at 1 instead of 0.
  %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Using a variable for the loop body broadcast.
define void @wrong_body_broadcast_splat_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Departure: the per-lane offsets are the %offsets argument, not a constant.
  %induction = add <4 x i32> %broadcast.splat, %offsets
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  %index.next = add i32 %index, 4
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; adding 5, instead of 4, to index.
define void @wrong_index_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  %n.plus.3 = add i32 %N, 3
  %n.vecs = lshr i32 %n.plus.3, 2
  %n.rounded = shl nuw i32 %n.vecs, 2
  %n.rounded.minus.4 = add i32 %n.rounded, -4
  %iters.minus.1 = lshr i32 %n.rounded.minus.4, 2
  %iters = add nuw nsw i32 %iters.minus.1, 1
  br i1 %n.is.zero, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %iters)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %remaining = phi i32 [ %start, %vector.ph ], [ %remaining.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %a.gep = getelementptr inbounds i32, ptr %a, i32 %index
  %active.mask = icmp ule <4 x i32> %induction, %broadcast.splat11
  %a.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %b.gep = getelementptr inbounds i32, ptr %b, i32 %index
  %b.vec = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %b.gep, i32 4, <4 x i1> %active.mask, <4 x i32> undef)
  %product = mul nsw <4 x i32> %b.vec, %a.vec
  %c.gep = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %product, ptr %c.gep, i32 4, <4 x i1> %active.mask)
  ; Departure: the scalar index steps by 5 although the vectors have 4 lanes.
  %index.next = add i32 %index, 5
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %remaining, i32 1)
  %loop.again = icmp ne i32 %remaining.next, 0
  br i1 %loop.again, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsics called by the functions in this file.
; NOTE(review): the #1/#2/#3 attribute groups are not defined in the visible
; part of the file - confirm they exist further down or drop the references.
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3