; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -passes=load-store-vectorizer,dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; To prove that the vectorization is safe, it tries to match nested adds
; and find an expression that adds a constant value to an existing index
; without the result overflowing.

target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
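
; Informal illustration (not checked by FileCheck): the four indices above are
; v1 + (v0 - 1), v1 + v0, v1 + (v0 + 1) and v1 + (v0 + 2). Since every add is
; nsw, none of the sums wraps, so after the sext the four pointers are simply
; base - 1, base, base + 1 and base + 2 with base = sext(v1 + v0), i.e. four
; consecutive bytes that can be loaded as a single <4 x i8>.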

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add nsw i32 %v0, %v1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %tmp9, %v1
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
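
; Informal note: here the adds carry no nsw flag, but %v0 and %v1 are both
; multiples of 4, so the low two bits of %v1 + %v0 are known to be zero and
; adding 1 or 2 cannot carry into the upper bits. That proves the 0/+1/+2
; loads consecutive, while the -1 offset could still wrap (when the sum is
; INT32_MIN), so the first load stays scalar and only a <3 x i8> is formed.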

define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 3
  %v1 = mul i32 %ind1, 3
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
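
; Informal note: in @ld_v4i8_add_known_bits1 the offsets are +3, 0, +1 and +2,
; so all four loads fold into one <4 x i8>; the lanes are then reshuffled
; because the first scalar load corresponded to the +3 offset. In
; @ld_v4i8_add_known_bits_by_assume the multiplier is 3, so the known-zero low
; bits come solely from the llvm.assume calls rather than from the multiplies.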

declare void @llvm.assume(i1)

define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
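
; Informal note: the two assume_on_arg functions repeat the known-bits cases,
; but the alignment facts now come from llvm.assume directly on the arguments
; instead of from multiplies; the expected shapes are the same (a scalar load
; plus a <3 x i8> for the -1 case, a reshuffled <4 x i8> for offsets 0..3).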

; Address computations are partly separated by control flow, with llvm.assume
; placed in the second basic block

define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  br label %bb.skip

bb.skip:
  ret void
}
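
; Informal note: %bit_cond (%v1 == 0) only becomes a known fact inside
; bb.loads, via the guarding branch and the llvm.assume, so known-bits queries
; have to be made in the context of the loads rather than at the definition of
; %tmp5. With %v1 known to be zero, the two indices reduce to %v0 and %v0 + 1
; (nsw), which are adjacent.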

; Same as ld_v2i8_add_different_contexts but with llvm.assume placed between loads

define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  br label %bb.skip

bb.skip:
  ret void
}

; llvm.assume is placed between loads in a single basic block

define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_context(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  ret void
}
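
; Informal note: here the assume pins %tmp5 (= %v1 + %v0) to zero, so the two
; indices are known to be 0 and 1. This test and the next only vary where the
; llvm.assume appears relative to the loads and the store, exercising the
; choice of context instruction within a single basic block.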

; Placing llvm.assume after all the loads and stores in the basic block still works

define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_context1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  ret void
}

; Make sure we don't vectorize the loads below, because the values feeding the
; sext instructions have neither the nsw flag nor known bits that would
; justify the vectorization.
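; For example, with %v0 == 0 and %v1 == 2147483647, %tmp5 is 2147483647 while
; %tmp10 wraps to -2147483648, so the supposedly adjacent indices end up
; 2^32 - 1 bytes apart after the sext.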

define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}