; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @matching_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_scalar_volatile(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar_volatile(
; CHECK-NEXT:    [[R:%.*]] = load volatile float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load volatile float, ptr %p, align 16
  ret float %r
}

define double @larger_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 16
  ret double %r
}

define float @smaller_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector_gep00(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector_gep00(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector_gep01(ptr align 16 dereferenceable(20) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 0, i64 1
  %r = load float, ptr %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep01_deref(ptr align 16 dereferenceable(19) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 0, i64 1
  %r = load float, ptr %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep10(ptr align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 1, i64 0
  %r = load float, ptr %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep10_deref(ptr align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 1, i64 0
  %r = load float, ptr %gep, align 16
  ret float %r
}

define float @nonmatching_int_vector(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @nonmatching_int_vector(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define double @less_aligned(ptr align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @less_aligned(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 4
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 4
  ret double %r
}

define float @matching_fp_scalar_small_deref(ptr align 16 dereferenceable(15) %p) {
; CHECK-LABEL: @matching_fp_scalar_small_deref(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define i64 @larger_int_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_int_scalar(
; CHECK-NEXT:    [[R:%.*]] = load i64, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret i64 [[R]]
;
  %r = load i64, ptr %p, align 16
  ret i64 %r
}

define i8 @smaller_int_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_int_scalar(
; CHECK-NEXT:    [[R:%.*]] = load i8, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret i8 [[R]]
;
  %r = load i8, ptr %p, align 16
  ret i8 %r
}

define double @larger_fp_scalar_256bit_vec(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 32
  ret double %r
}

define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; Pointer type does not change cost.
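; (Note: with opaque pointers the bitcast is gone, so the 'casted' variant below
; differs from the test above only in the parameter's align attribute.)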

define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; This is the canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr addrspace(44) [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr addrspace(44) %p, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 2
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 2
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Verify that alignment of the new load is not over-specified.
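; (%p is only align 2 in the test below, so any new or widened load created here
; must not inherit the source load's align 8.)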

define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 8
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 8
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 12
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size and the offset must be low enough to fit in the vector
; (bitcasting would not help this case).

define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 13
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 13
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.
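; (For the gep10 test below: the offset is one full <8 x i16> = 16 bytes, and a
; widened <8 x i16> load spans another 16 bytes, so dereferenceable(32) covers
; exactly bytes 16..31.)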

define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under asan because widened load can cause spurious
; use-after-poison issues when __asan_poison_memory_region is used.

define <8 x i16> @gep10_load_i16_insert_v8i16_asan(ptr align 16 dereferenceable(32) %p) sanitize_address nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; hwasan and memtag should be similarly suppressed.

define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(ptr align 16 dereferenceable(32) %p) sanitize_hwaddress nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(ptr align 16 dereferenceable(32) %p) sanitize_memtag nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under tsan because widened load may overlap bytes
; being concurrently modified. tsan does not know that some bytes are undef.
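; (Only 2 of the 16 bytes are actually read by the scalar i16 load below; a
; widened <8 x i16> load would also touch the other 14 bytes, which another
; thread may be writing concurrently.)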

define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(ptr align 16 dereferenceable(32) %p) sanitize_thread nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - can't safely load the offset vector, but could load+shuffle.

define <8 x i16> @gep10_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(31) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - do not alter volatile.

define <4 x float> @load_f32_insert_v4f32_volatile(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT:    [[S:%.*]] = load volatile float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load volatile float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Pointer is not as aligned as load, but that's ok.
; The new load uses the larger alignment value.

define <4 x float> @load_f32_insert_v4f32_align(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Negative test - not enough bytes.
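; (A <4 x float> load spans 16 bytes, but the test below only guarantees
; dereferenceable(15).)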

define <4 x float> @load_f32_insert_v4f32_deref(ptr align 4 dereferenceable(15) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <16 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <16 x float> undef, float %s, i32 0
  ret <16 x float> %r
}

define <2 x float> @load_f32_insert_v2f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.

define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_f32_insert_v2f32_asan(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

declare ptr @getscaleptr()
define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) {
; CHECK-LABEL: @PR47558_multiple_use_load(
; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
  %op = load <2 x float>, ptr %opptr, align 4
  %scale = load float, ptr %scaleptr, align 16
  %t1 = insertelement <2 x float> undef, float %scale, i32 0
  %t2 = insertelement <2 x float> %t1, float %scale, i32 1
  %t3 = fmul <2 x float> %op, %t2
  %t4 = extractelement <2 x float> %t3, i32 0
  %result0 = insertelement <2 x float> undef, float %t4, i32 0
  %t5 = extractelement <2 x float> %t3, i32 1
  %result1 = insertelement <2 x float> %result0, float %t5, i32 1
  store <2 x float> %result1, ptr %resultptr, align 8
  ret void
}

define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, ptr %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v8f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @load_v8f32_extract_insert_v4f32(
; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x float>, ptr [[P:%.*]], i32 0, i32 0
; SSE2-NEXT:    [[S:%.*]] = load float, ptr [[TMP1]], align 4
; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; SSE2-NEXT:    ret <4 x float> [[R]]
;
; AVX2-LABEL: @load_v8f32_extract_insert_v4f32(
; AVX2-NEXT:    [[L:%.*]] = load <8 x float>, ptr [[P:%.*]], align 4
; AVX2-NEXT:    [[S:%.*]] = extractelement <8 x float> [[L]], i32 0
; AVX2-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; AVX2-NEXT:    ret <4 x float> [[R]]
;
  %l = load <8 x float>, ptr %p, align 4
  %s = extractelement <8 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 dereferenceable(16) %p, ptr %store_ptr) nofree nosync {
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %l = load <1 x i32>, ptr %p, align 4
  store <1 x i32> %l, ptr %store_ptr
  %s = extractelement <1 x i32> %l, i32 0
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[TMP1]], align 8
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1
  %l = load <2 x i16>, ptr %gep, align 8
  %s = extractelement <2 x i16> %l, i32 0
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(ptr %0) {
; CHECK-LABEL: @PR30986(
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, ptr [[TMP0:%.*]], i32 0, i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 16
; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, ptr [[TMP0]], i32 0, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %2 = load <2 x i64>, ptr %0, align 16
  %3 = extractelement <2 x i64> %2, i32 0
  %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
  %5 = insertelement <2 x i64> undef, i64 %4, i32 0
  %6 = extractelement <2 x i64> %2, i32 1
  %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
  %8 = insertelement <2 x i64> %5, i64 %7, i32 1
  ret <2 x i64> %8
}
declare i64 @llvm.ctpop.i64(i64)