; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL
;
; Just one 32-bit run to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X86-AVX512F

; Overview: every function below builds a 512-bit vector out of several
; smaller loads from %ptr, and the check lines record the code llc emits.
; The function names appear to encode the layout as
;   merge_<result type>_<loaded type>_<lane pattern>
; where each pattern character describes one result lane: a hex digit is the
; index of the loaded source element, 'u' is a lane left undefined and 'z' is
; a lane set to zero.  (Convention inferred from the IR in this file.)
; The check lines are regenerated by the script named in the first line, so
; they should not be edited by hand.

; Concatenates <2 x double> elements 1, 2, undef, 4 of %ptr.  The expected
; code is a single unaligned 512-bit load from byte offset 16.
define <8 x double> @merge_8f64_2f64_12u4(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_12u4:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_2f64_12u4:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 16(%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2
  %ptr3 = getelementptr inbounds <2 x double>, ptr %ptr, i64 4
  %val0 = load <2 x double>, ptr %ptr0
  %val1 = load <2 x double>, ptr %ptr1
  %val3 = load <2 x double>, ptr %ptr3
  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Concatenates <2 x double> elements 2, 3, zero, 5 of %ptr.  The expected
; code is one 512-bit load from byte offset 32 followed by an AND with a
; constant-pool operand.
define <8 x double> @merge_8f64_2f64_23z5(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_23z5:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 32(%rdi), %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_2f64_23z5:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 32(%eax), %zmm0
; X86-AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 3
  %ptr3 = getelementptr inbounds <2 x double>, ptr %ptr, i64 5
  %val0 = load <2 x double>, ptr %ptr0
  %val1 = load <2 x double>, ptr %ptr1
  %val3 = load <2 x double>, ptr %ptr3
  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Lower 256 bits are zero, upper 256 bits are <4 x double> element 2 of %ptr
; (byte offset 64 in the expected code).  Expected: zero a register, then
; insert the loaded half.
define <8 x double> @merge_8f64_4f64_z2(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_4f64_z2:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_4f64_z2:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-AVX512F-NEXT:    vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x double>, ptr %ptr, i64 2
  %val1 = load <4 x double>, ptr %ptr1
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Inserts doubles 2 and 3 into lanes 0 and 1 and double 9 into lane 7; the
; remaining lanes are never written.  Expected: a single 512-bit load from
; byte offset 16.
define <8 x double> @merge_8f64_f64_23uuuuu9(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 16(%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 9
  %val0 = load double, ptr %ptr0
  %val1 = load double, ptr %ptr1
  %val7 = load double, ptr %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
  ret <8 x double> %res7
}

; Doubles 1 and 2 go to lanes 0 and 1; lanes 2, 3, 6 and 7 are zero; lanes 4
; and 5 are never written.  Expected: only a 128-bit load from byte offset 8
; into xmm0.
define <8 x double> @merge_8f64_f64_12zzuuzz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_12zzuuzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 8(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2
  %val0 = load double, ptr %ptr0
  %val1 = load double, ptr %ptr1
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
  %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
  %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
  %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
  ret <8 x double> %res7
}

; Doubles 1, 3, 5 and 8 go to lanes 0, 2, 4 and 7; lane 5 is zero; lanes 1, 3
; and 6 are never written.  Expected: one 512-bit load from byte offset 8
; followed by an AND with a constant-pool operand.
define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
; X86-AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 1
  %ptr2 = getelementptr inbounds double, ptr %ptr, i64 3
  %ptr4 = getelementptr inbounds double, ptr %ptr, i64 5
  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 8
  %val0 = load double, ptr %ptr0
  %val2 = load double, ptr %ptr2
  %val4 = load double, ptr %ptr4
  %val7 = load double, ptr %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res2 = insertelement <8 x double> %res0, double %val2, i32 2
  %res4 = insertelement <8 x double> %res2, double %val4, i32 4
  %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
  %res7 = insertelement <8 x double> %res5, double %val7, i32 7
  ret <8 x double> %res7
}

; Integer counterpart of merge_8f64_4f64_z2: lower 256 bits are zero, upper
; 256 bits are <4 x i64> element 3 of %ptr (byte offset 96 in the expected
; code).
define <8 x i64> @merge_8i64_4i64_z3(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_4i64_z3:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8i64_4i64_z3:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-AVX512F-NEXT:    vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x i64>, ptr %ptr, i64 3
  %val1 = load <4 x i64>, ptr %ptr1
  %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}

; i64 elements 5 and 6 go to lanes 0 and 1, element 9 goes to lane 4; lanes
; 2, 3, 6 and 7 are zero; lane 5 is never written.  Expected: a 128-bit load
; for the low part, a zero-extending 64-bit load for the high part, and one
; insert to combine them.
define <8 x i64> @merge_8i64_i64_56zz9uzz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_56zz9uzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 40(%rdi), %xmm0
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 40(%eax), %xmm0
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 5
  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 9
  %val0 = load i64, ptr %ptr0
  %val1 = load i64, ptr %ptr1
  %val4 = load i64, ptr %ptr4
  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
  %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
  %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
  %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
  %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
  %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
  ret <8 x i64> %res7
}

; Integer counterpart of merge_8f64_f64_1u3u5zu8: i64 elements 1, 3, 5 and 8
; go to lanes 0, 2, 4 and 7 and lane 5 is zero.
; NOTE(review): the recorded 32-bit output uses the dword form of the AND
; while the 64-bit output uses the qword form; this is what the generator
; captured, so leave it as is unless regenerating the assertions.
define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
  %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 5
  %ptr7 = getelementptr inbounds i64, ptr %ptr, i64 8
  %val0 = load i64, ptr %ptr0
  %val2 = load i64, ptr %ptr2
  %val4 = load i64, ptr %ptr4
  %val7 = load i64, ptr %ptr7
  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
  %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
  %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
  %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
  %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
  ret <8 x i64> %res7
}

; Floats 8 and 9 go to lanes 0 and 1; lanes 2, 3, 4 and 15 are zero; the
; other lanes are never written.  Expected: a single zero-extending 64-bit
; load.
define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 8
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 9
  %val0 = load float, ptr %ptr0
  %val1 = load float, ptr %ptr1
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
  %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
  %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
  %resF = insertelement <16 x float> %res4, float 0.0, i32 15
  ret <16 x float> %resF
}

; Floats 4, 5 and 7 go to lanes 0, 1 and 3; every other lane is never
; written.  Expected: a single 128-bit load from byte offset 16.
define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 16(%eax), %xmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 5
  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 7
  %val0 = load float, ptr %ptr0
  %val1 = load float, ptr %ptr1
  %val3 = load float, ptr %ptr3
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
  %res3 = insertelement <16 x float> %res1, float %val3, i32 3
  ret <16 x float> %res3
}

; Floats 0, 3, 12, 14 and 15 go to the lanes with the same indices; every
; other lane is never written.  Expected: a single 512-bit load from %ptr.
define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups (%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
  %ptrC = getelementptr inbounds float, ptr %ptr, i64 12
  %ptrE = getelementptr inbounds float, ptr %ptr, i64 14
  %ptrF = getelementptr inbounds float, ptr %ptr, i64 15
  %val0 = load float, ptr %ptr
  %val3 = load float, ptr %ptr3
  %valC = load float, ptr %ptrC
  %valE = load float, ptr %ptrE
  %valF = load float, ptr %ptrF
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
  %resC = insertelement <16 x float> %res3, float %valC, i32 12
  %resE = insertelement <16 x float> %resC, float %valE, i32 14
  %resF = insertelement <16 x float> %resE, float %valF, i32 15
  ret <16 x float> %resF
}

; Same loads as merge_16f32_f32_0uu3uuuuuuuuCuEF, with lanes 4, 5 and 13
; additionally set to zero.  Expected: one 512-bit load from %ptr followed by
; an AND with a constant-pool operand.
; NOTE(review): the name places a 'z' at position 11, but the IR zeroes lane
; 13 and never writes lane 11 - confirm which layout was intended.
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
  %ptrC = getelementptr inbounds float, ptr %ptr, i64 12
  %ptrE = getelementptr inbounds float, ptr %ptr, i64 14
  %ptrF = getelementptr inbounds float, ptr %ptr, i64 15
  %val0 = load float, ptr %ptr
  %val3 = load float, ptr %ptr3
  %valC = load float, ptr %ptrC
  %valE = load float, ptr %ptrE
  %valF = load float, ptr %ptrF
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
  %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
  %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
  %resC = insertelement <16 x float> %res5, float %valC, i32 12
  %resD = insertelement <16 x float> %resC, float 0.0, i32 13
  %resE = insertelement <16 x float> %resD, float %valE, i32 14
  %resF = insertelement <16 x float> %resE, float %valF, i32 15
  ret <16 x float> %resF
}

; i32 elements 1 and 2 go to lanes 0 and 1; lanes 2, 3, 4 and 15 are zero;
; the other lanes are never written.  Expected: a single zero-extending
; 64-bit load.
define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
  %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
  %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
  %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
  %resF = insertelement <16 x i32> %res4, i32 0, i32 15
  ret <16 x i32> %resF
}

; i32 elements 2, 3 and 5 go to lanes 0, 1 and 3; every other lane is never
; written.  Expected: a single 128-bit load from byte offset 8.
define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 8(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %val3 = load i32, ptr %ptr3
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
  ret <16 x i32> %res3
}

; i32 elements 0, 3, 12, 14 and 15 go to the lanes with the same indices;
; every other lane is never written.  Expected: a single 512-bit load from
; %ptr.
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups (%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 12
  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 14
  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 15
  %val0 = load i32, ptr %ptr
  %val3 = load i32, ptr %ptr3
  %valC = load i32, ptr %ptrC
  %valE = load i32, ptr %ptrE
  %valF = load i32, ptr %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}

; Same loads as merge_16i32_i32_0uu3uuuuuuuuCuEF, with lanes 4, 5 and 13
; additionally set to zero.  Expected: one 512-bit load from %ptr followed by
; an AND with a constant-pool operand.
; NOTE(review): the name places a 'z' at position 11, but the IR zeroes lane
; 13 and never writes lane 11 - confirm which layout was intended.
define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 12
  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 14
  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 15
  %val0 = load i32, ptr %ptr
  %val3 = load i32, ptr %ptr3
  %valC = load i32, ptr %ptrC
  %valE = load i32, ptr %ptrE
  %valF = load i32, ptr %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
  %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
  %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
  %resD = insertelement <16 x i32> %resC, i32 0, i32 13
  %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}

; i16 elements 1, 2 and 4 go to lanes 0, 1 and 3; lanes 30 and 31 are zero;
; the other lanes are never written.  Expected: a single zero-extending
; 64-bit load.
define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 2
  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 4
  %val0 = load i16, ptr %ptr0
  %val1 = load i16, ptr %ptr1
  %val3 = load i16, ptr %ptr3
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
  %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
  %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
  ret <32 x i16> %res31
}

; i16 elements 4, 5 and 7 go to lanes 0, 1 and 3; every other lane is never
; written.  Expected: a single zero-extending 64-bit load.
define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 7
  %val0 = load i16, ptr %ptr0
  %val1 = load i16, ptr %ptr1
  %val3 = load i16, ptr %ptr3
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
  ret <32 x i16> %res3
}

; i16 elements 2 and 3 go to lanes 0 and 1; lanes 3, 14, 15, 16 and 17 are
; zero; the other lanes are never written.  Expected: a single zero-extending
; 32-bit load.
define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 3
  %val0 = load i16, ptr %ptr0
  %val1 = load i16, ptr %ptr1
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
  %resE = insertelement <32 x i16> %res3, i16 0, i16 14
  %resF = insertelement <32 x i16> %resE, i16 0, i16 15
  %resG = insertelement <32 x i16> %resF, i16 0, i16 16
  %resH = insertelement <32 x i16> %resG, i16 0, i16 17
  ret <32 x i16> %resH
}

; i8 elements 1, 2, 4 and 8 go to lanes 0, 1, 3 and 7; lanes 14, 15, 16, 17
; and 63 are zero; the other lanes are never written.  Expected: a single
; zero-extending 64-bit load.
define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 4
  %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 8
  %val0 = load i8, ptr %ptr0
  %val1 = load i8, ptr %ptr1
  %val3 = load i8, ptr %ptr3
  %val7 = load i8, ptr %ptr7
  %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
  %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
  %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
  %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
  %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
  %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
  %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
  ret <64 x i8> %res63
}

; i8 elements 1, 2 and 4 go to lanes 0, 1 and 3; lanes 14, 15, 16, 17 and 63
; are zero; the other lanes are never written.  Expected: a single
; zero-extending 32-bit load.
define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 4
  %val0 = load i8, ptr %ptr0
  %val1 = load i8, ptr %ptr1
  %val3 = load i8, ptr %ptr3
  %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
  %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
  %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
  %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
  %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
  %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
  ret <64 x i8> %res63
}

;
; consecutive loads including any/all volatiles may not be combined
;

; Same layout as merge_8f64_f64_23uuuuu9, but the first load is volatile.
; The expected code keeps separate loads instead of one 512-bit load.
define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; ALL-NEXT:    vbroadcastsd 72(%rdi), %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX512F-NEXT:    vbroadcastsd 72(%eax), %ymm1
; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 9
  %val0 = load volatile double, ptr %ptr0
  %val1 = load double, ptr %ptr1
  %val7 = load double, ptr %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
  ret <8 x double> %res7
}

; Same layout as merge_16i32_i32_0uu3uuuuuuuuCuEF, but every load is
; volatile.  The expected code performs five separate element loads and
; assembles the vector with inserts.
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vpinsrd $3, 12(%rdi), %xmm0, %xmm0
; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vpinsrd $2, 56(%rdi), %xmm1, %xmm1
; ALL-NEXT:    vpinsrd $3, 60(%rdi), %xmm1, %xmm1
; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    vpinsrd $3, 12(%eax), %xmm0, %xmm0
; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    vpinsrd $2, 56(%eax), %xmm1, %xmm1
; X86-AVX512F-NEXT:    vpinsrd $3, 60(%eax), %xmm1, %xmm1
; X86-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 12
  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 14
  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 15
  %val0 = load volatile i32, ptr %ptr
  %val3 = load volatile i32, ptr %ptr3
  %valC = load volatile i32, ptr %ptrC
  %valE = load volatile i32, ptr %ptrE
  %valF = load volatile i32, ptr %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}