; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
;
; 32-bit SSE tests to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
;
; Each function below loads individual scalars from %ptr and assembles them
; into a 128-bit vector with insertelement. The assertions record the load
; sequence expected from llc for every configuration listed above.
;
; Function names follow merge_<vector type>_<element type>_<lanes>. Each lane
; character is the index of the element loaded into that lane (hex digits for
; the i8 tests), 'u' marks a lane left undef and 'z' marks a lane set to zero.
; An _incN suffix means element N is also incremented and stored back.

; Doubles 2 and 3: the 64-bit runs and the i686 +sse4.1 run expect a single
; unaligned 16-byte load at byte offset 16. The i686 run with only +sse
; expects two x87 fldl loads instead.
define <2 x double> @merge_2f64_f64_23(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2f64_f64_23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 16(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2f64_f64_23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 16(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_2f64_f64_23:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    fldl 16(%eax)
; X86-SSE1-NEXT:    fldl 24(%eax)
; X86-SSE1-NEXT:    fxch %st(1)
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_2f64_f64_23:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 16(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
  %val0 = load double, ptr %ptr0
  %val1 = load double, ptr %ptr1
  %res0 = insertelement <2 x double> undef, double %val0, i32 0
  %res1 = insertelement <2 x double> %res0, double %val1, i32 1
  ret <2 x double> %res1
}

; i64 elements 1 and 2: expected to become one unaligned 16-byte load at byte
; offset 8. The i686 run with only +sse expects four 32-bit scalar copies
; through general-purpose registers instead of a vector load.
define <2 x i64> @merge_2i64_i64_12(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2i64_i64_12:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2i64_i64_12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_2i64_i64_12:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    movl 16(%ecx), %edi
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    movl %edi, 8(%eax)
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_2i64_i64_12:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %val0 = load i64, ptr %ptr0
  %val1 = load i64, ptr %ptr1
  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
  ret <2 x i64> %res1
}

; Floats 2..5 fill all four lanes: every configuration expects one unaligned
; 16-byte load at byte offset 8.
define <4 x float> @merge_4f32_f32_2345(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_2345:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_2345:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE-LABEL: merge_4f32_f32_2345:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups 8(%eax), %xmm0
; X86-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 3
  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 4
  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 5
  %val0 = load float, ptr %ptr0
  %val1 = load float, ptr %ptr1
  %val2 = load float, ptr %ptr2
  %val3 = load float, ptr %ptr3
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
  ret <4 x float> %res3
}

; Float 3 in lane 0, constant 0.0 in lane 1, lanes 2-3 undef: every
; configuration expects a single movss scalar load, which zeroes the upper
; lanes.
define <4 x float> @merge_4f32_f32_3zuu(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_3zuu:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_3zuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE-LABEL: merge_4f32_f32_3zuu:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
  %val0 = load float, ptr %ptr0
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  ret <4 x float> %res1
}

; Floats 3 and 4 in lanes 0-1, lanes 2-3 undef: expected to become one movsd
; load of the pair. The i686 run with only +sse expects xorps + movlps.
define <4 x float> @merge_4f32_f32_34uu(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_34uu:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_34uu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_34uu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_34uu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 4
  %val0 = load float, ptr %ptr0
  %val1 = load float, ptr %ptr1
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  ret <4 x float> %res1
}

; Floats 3, 4 and 6 inserted into a zeroinitializer base, so lane 2 stays
; zero. Expected code is a 16-byte load at byte offset 12 combined with a
; zeroed register: blendps with +sse4.1 and AVX, a shufps pair otherwise.
define <4 x float> @merge_4f32_f32_34z6(ptr %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_34z6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups 12(%rdi), %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_34z6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups 12(%rdi), %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_34z6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_34z6:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movups 12(%eax), %xmm0
; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_34z6:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 12(%eax), %xmm1
; X86-SSE41-NEXT:    xorps %xmm0, %xmm0
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 4
  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 6
  %val0 = load float, ptr %ptr0
  %val1 = load float, ptr %ptr1
  %val3 = load float, ptr %ptr3
  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res3 = insertelement <4 x float> %res1, float %val3, i32 3
  ret <4 x float> %res3
}

; Floats 4 and 5 inserted into a zeroinitializer base (lanes 2-3 zero):
; expected to become one movsd load. The i686 run with only +sse expects
; xorps + movlps.
define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_45zz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_45zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_45zz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_45zz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 5
  %val0 = load float, ptr %ptr0
  %val1 = load float, ptr %ptr1
  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  ret <4 x float> %res1
}

; Floats 0, 1 and 2 with lane 3 explicitly undef. The expected code is not a
; single 16-byte load: it loads the first pair and then inserts the third
; element with a separate load.
define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_012u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_012u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_012u:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_012u:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-SSE41-NEXT:    retl
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 2
  %val0 = load float, ptr %ptr
  %val1 = load float, ptr %ptr1
  %val2 = load float, ptr %ptr2
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float undef, i32 3
  ret <4 x float> %res3
}

; Floats 0, 1 and 9 with lane 3 explicitly undef. Element 9 is not adjacent to
; the first pair, so the expected code loads the pair and inserts the third
; element with a separate load.
define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_019u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_019u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_019u:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_019u:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-SSE41-NEXT:    retl
  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 9
  %val0 = load float, ptr %ptr
  %val1 = load float, ptr %ptr1
  %val2 = load float, ptr %ptr2
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float undef, i32 3
  ret <4 x float> %res3
}

; i32 elements 2, 3 and 5 with lane 2 never written (undef): expected to
; become one unaligned 16-byte load at byte offset 8. The i686 run with only
; +sse expects three scalar 32-bit copies.
define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_23u5:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    .cfi_offset %esi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_23u5:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %val3 = load i32, ptr %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; Same lanes as merge_4i32_i32_23u5, but element 2 is incremented and stored
; back between the loads. The expected code still performs the merged vector
; load first and then the in-memory increment at byte offset 8.
define <4 x i32> @merge_4i32_i32_23u5_inc2(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5_inc2:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    incl 8(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5_inc2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    incl 8(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%edx), %edi
; X86-SSE1-NEXT:    movl %edi, 8(%ecx)
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    incl 8(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %inc = add i32 %val0, 1
  store i32 %inc, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %val3 = load i32, ptr %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; Same lanes as merge_4i32_i32_23u5, but element 3 is incremented and stored
; back before element 5 is loaded. The expected code performs the merged
; vector load first and then the in-memory increment at byte offset 12.
define <4 x i32> @merge_4i32_i32_23u5_inc3(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5_inc3:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    incl 12(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5_inc3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    incl 12(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%esi), %edi
; X86-SSE1-NEXT:    movl %edi, 12(%ecx)
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    incl 12(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %inc = add i32 %val1, 1
  store i32 %inc, ptr %ptr1
  %val3 = load i32, ptr %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; i32 element 3 in lane 0, constant 0 in lane 1, lanes 2-3 undef: expected to
; become one movss load. The i686 run with only +sse expects scalar integer
; moves.
define <4 x i32> @merge_4i32_i32_3zuu(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_3zuu:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_3zuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_3zuu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, (%eax)
; X86-SSE1-NEXT:    movl $0, 4(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_3zuu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
  %val0 = load i32, ptr %ptr0
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
  ret <4 x i32> %res1
}

; i32 elements 3 and 4 in lanes 0-1, lanes 2-3 undef: expected to become one
; movsd load. The i686 run with only +sse expects two scalar 32-bit copies.
define <4 x i32> @merge_4i32_i32_34uu(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_34uu:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_34uu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_34uu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 12(%ecx), %edx
; X86-SSE1-NEXT:    movl 16(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_34uu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 4
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; i32 elements 4 and 5 inserted into a zeroinitializer base (lanes 2-3 zero):
; expected to become one movsd load. The i686 run with only +sse expects
; scalar copies plus explicit zero stores for the upper half.
define <4 x i32> @merge_4i32_i32_45zz(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_45zz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 16(%ecx), %edx
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_45zz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; Same lanes as merge_4i32_i32_45zz, but element 4 is incremented and stored
; back before element 5 is loaded. The expected code performs the merged load
; first and then the in-memory increment at byte offset 16.
define <4 x i32> @merge_4i32_i32_45zz_inc4(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz_inc4:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    incl 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz_inc4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    incl 16(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 16(%ecx), %edx
; X86-SSE1-NEXT:    movl 20(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%edx), %edi
; X86-SSE1-NEXT:    movl %edi, 16(%ecx)
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    incl 16(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %inc = add i32 %val0, 1
  store i32 %inc, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; Same lanes as merge_4i32_i32_45zz, but element 5 is incremented and stored
; back after both loads. The expected code performs the merged load first and
; then the in-memory increment at byte offset 20.
define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz_inc5:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    incl 20(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz_inc5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    incl 20(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 16(%ecx), %edx
; X86-SSE1-NEXT:    movl 20(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%esi), %edi
; X86-SSE1-NEXT:    movl %edi, 20(%ecx)
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    incl 20(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
  %val0 = load i32, ptr %ptr0
  %val1 = load i32, ptr %ptr1
  %inc = add i32 %val1, 1
  store i32 %inc, ptr %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; i16 elements 2, 3, 5, 6, 7 and 9 with lanes 2 and 6 never written (undef):
; expected to become one unaligned 16-byte load at byte offset 4. The i686 run
; with only +sse expects a mix of 32-bit and 16-bit scalar copies.
define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_23u567u9:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 4(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_23u567u9:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 4(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 4(%ecx), %edx
; X86-SSE1-NEXT:    movl 10(%ecx), %esi
; X86-SSE1-NEXT:    movzwl 14(%ecx), %edi
; X86-SSE1-NEXT:    movzwl 18(%ecx), %ecx
; X86-SSE1-NEXT:    movw %di, 10(%eax)
; X86-SSE1-NEXT:    movw %cx, 14(%eax)
; X86-SSE1-NEXT:    movl %esi, 6(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 4(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 2
  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 3
  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 5
  %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 6
  %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 7
  %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 9
  %val0 = load i16, ptr %ptr0
  %val1 = load i16, ptr %ptr1
  %val3 = load i16, ptr %ptr3
  %val4 = load i16, ptr %ptr4
  %val5 = load i16, ptr %ptr5
  %val7 = load i16, ptr %ptr7
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
  %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
  %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
  %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
  ret <8 x i16> %res7
}

; i16 elements 3 and 4 in lanes 0-1, all other lanes undef: expected to become
; one movss load of the pair. The i686 run with only +sse expects a single
; 32-bit scalar copy.
define <8 x i16> @merge_8i16_i16_34uuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_34uuuuuu:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_34uuuuuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 6(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, (%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 3
  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 4
  %val0 = load i16, ptr %ptr0
  %val1 = load i16, ptr %ptr1
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  ret <8 x i16> %res1
}

; i16 elements 4, 5 and 7 in lanes 0, 1 and 3 (lane 2 undef), lanes 4-7
; explicitly zero: expected to become one movsd load. The i686 run with only
; +sse expects scalar copies plus zero stores for the upper half.
define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_45u7zzzz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_45u7zzzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movzwl 14(%ecx), %ecx
; X86-SSE1-NEXT:    movw %cx, 6(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 7
  %val0 = load i16, ptr %ptr0
  %val1 = load i16, ptr %ptr1
  %val3 = load i16, ptr %ptr3
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
  %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
  %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
  %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
  %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
  ret <8 x i16> %res7
}

; Bytes 0..15 except 2 and 14, which are never written (undef): expected to
; become one unaligned 16-byte load at offset 0. The i686 run with only +sse
; expects a sequence of 32-, 16- and 8-bit scalar copies.
define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %ebp
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %ebx
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
; X86-SSE1-NEXT:    .cfi_offset %esi, -20
; X86-SSE1-NEXT:    .cfi_offset %edi, -16
; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movzwl (%ecx), %ebp
; X86-SSE1-NEXT:    movl 3(%ecx), %esi
; X86-SSE1-NEXT:    movl 7(%ecx), %edi
; X86-SSE1-NEXT:    movzwl 11(%ecx), %ebx
; X86-SSE1-NEXT:    movzbl 13(%ecx), %edx
; X86-SSE1-NEXT:    movzbl 15(%ecx), %ecx
; X86-SSE1-NEXT:    movb %dl, 13(%eax)
; X86-SSE1-NEXT:    movb %cl, 15(%eax)
; X86-SSE1-NEXT:    movw %bx, 11(%eax)
; X86-SSE1-NEXT:    movl %edi, 7(%eax)
; X86-SSE1-NEXT:    movl %esi, 3(%eax)
; X86-SSE1-NEXT:    movw %bp, (%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    popl %ebx
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %ebp
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups (%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ptr4 = getelementptr inbounds i8, ptr %ptr, i64 4
  %ptr5 = getelementptr inbounds i8, ptr %ptr, i64 5
  %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 6
  %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 7
  %ptr8 = getelementptr inbounds i8, ptr %ptr, i64 8
  %ptr9 = getelementptr inbounds i8, ptr %ptr, i64 9
  %ptrA = getelementptr inbounds i8, ptr %ptr, i64 10
  %ptrB = getelementptr inbounds i8, ptr %ptr, i64 11
  %ptrC = getelementptr inbounds i8, ptr %ptr, i64 12
  %ptrD = getelementptr inbounds i8, ptr %ptr, i64 13
  %ptrF = getelementptr inbounds i8, ptr %ptr, i64 15
  %val0 = load i8, ptr %ptr
  %val1 = load i8, ptr %ptr1
  %val3 = load i8, ptr %ptr3
  %val4 = load i8, ptr %ptr4
  %val5 = load i8, ptr %ptr5
  %val6 = load i8, ptr %ptr6
  %val7 = load i8, ptr %ptr7
  %val8 = load i8, ptr %ptr8
  %val9 = load i8, ptr %ptr9
  %valA = load i8, ptr %ptrA
  %valB = load i8, ptr %ptrB
  %valC = load i8, ptr %ptrC
  %valD = load i8, ptr %ptrD
  %valF = load i8, ptr %ptrF
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
  %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
  %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
  %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
  %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
  %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
  %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
  %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
  %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
  %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
  %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
  ret <16 x i8> %resF
}

; Bytes 0, 1 and 3 loaded (lane 2 undef), lanes 6, 7, 13, 14 and 15 explicitly
; zero, everything else undef: expected to become one movss load. The i686 run
; with only +sse expects scalar copies plus explicit zero stores.
define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movzwl (%ecx), %edx
; X86-SSE1-NEXT:    movzbl 3(%ecx), %ecx
; X86-SSE1-NEXT:    movb %cl, 3(%eax)
; X86-SSE1-NEXT:    movw %dx, (%eax)
; X86-SSE1-NEXT:    movb $0, 15(%eax)
; X86-SSE1-NEXT:    movw $0, 13(%eax)
; X86-SSE1-NEXT:    movw $0, 6(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
  %val0 = load i8, ptr %ptr
  %val1 = load i8, ptr %ptr1
  %val3 = load i8, ptr %ptr3
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
  %res6 = insertelement <16 x i8> %res3, i8 0, i32 6
  %res7 = insertelement <16 x i8> %res6, i8 0, i32 7
  %resD = insertelement <16 x i8> %res7, i8 0, i32 13
  %resE = insertelement <16 x i8> %resD, i8 0, i32 14
  %resF = insertelement <16 x i8> %resE, i8 0, i32 15
  ret <16 x i8> %resF
}

define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 =
mem[0],zero 1012; SSE-NEXT: retq 1013; 1014; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1015; AVX: # %bb.0: 1016; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1017; AVX-NEXT: retq 1018; 1019; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1020; X86-SSE1: # %bb.0: 1021; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1022; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1023; X86-SSE1-NEXT: movl (%ecx), %edx 1024; X86-SSE1-NEXT: movzwl 6(%ecx), %ecx 1025; X86-SSE1-NEXT: movw %cx, 6(%eax) 1026; X86-SSE1-NEXT: movl %edx, (%eax) 1027; X86-SSE1-NEXT: movb $0, 15(%eax) 1028; X86-SSE1-NEXT: movw $0, 13(%eax) 1029; X86-SSE1-NEXT: retl $4 1030; 1031; X86-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1032; X86-SSE41: # %bb.0: 1033; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1034; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1035; X86-SSE41-NEXT: retl 1036 %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1 1037 %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 2 1038 %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3 1039 %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 6 1040 %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 7 1041 %val0 = load i8, ptr %ptr 1042 %val1 = load i8, ptr %ptr1 1043 %val2 = load i8, ptr %ptr2 1044 %val3 = load i8, ptr %ptr3 1045 %val6 = load i8, ptr %ptr6 1046 %val7 = load i8, ptr %ptr7 1047 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0 1048 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1 1049 %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2 1050 %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3 1051 %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6 1052 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7 1053 %resD = insertelement <16 x i8> %res7, i8 0, i32 13 1054 %resE = insertelement <16 x i8> %resD, i8 0, i32 14 1055 %resF = insertelement <16 x i8> %resE, i8 0, i32 15 1056 ret <16 x i8> %resF 1057} 1058 1059define void @merge_4i32_i32_combine(ptr %dst, ptr %src) { 1060; SSE-LABEL: merge_4i32_i32_combine: 1061; 
SSE: # %bb.0: 1062; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1063; SSE-NEXT: movaps %xmm0, (%rdi) 1064; SSE-NEXT: retq 1065; 1066; AVX-LABEL: merge_4i32_i32_combine: 1067; AVX: # %bb.0: 1068; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1069; AVX-NEXT: vmovaps %xmm0, (%rdi) 1070; AVX-NEXT: retq 1071; 1072; X86-SSE1-LABEL: merge_4i32_i32_combine: 1073; X86-SSE1: # %bb.0: 1074; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1075; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1076; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1077; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] 1078; X86-SSE1-NEXT: andps %xmm0, %xmm1 1079; X86-SSE1-NEXT: movaps %xmm1, (%eax) 1080; X86-SSE1-NEXT: retl 1081; 1082; X86-SSE41-LABEL: merge_4i32_i32_combine: 1083; X86-SSE41: # %bb.0: 1084; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1085; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx 1086; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1087; X86-SSE41-NEXT: movaps %xmm0, (%eax) 1088; X86-SSE41-NEXT: retl 1089 %1 = load i32, ptr %src 1090 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 1091 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer 1092 %4 = lshr <4 x i32> %3, <i32 0, i32 undef, i32 undef, i32 undef> 1093 %5 = and <4 x i32> %4, <i32 -1, i32 0, i32 0, i32 0> 1094 store <4 x i32> %5, ptr %dst 1095 ret void 1096} 1097 1098; 1099; consecutive loads including any/all volatiles may not be combined 1100; 1101 1102define <2 x i64> @merge_2i64_i64_12_volatile(ptr %ptr) nounwind uwtable noinline ssp { 1103; SSE-LABEL: merge_2i64_i64_12_volatile: 1104; SSE: # %bb.0: 1105; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1106; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero 1107; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1108; SSE-NEXT: retq 1109; 1110; AVX-LABEL: merge_2i64_i64_12_volatile: 1111; AVX: # %bb.0: 1112; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1113; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 
1114; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1115; AVX-NEXT: retq 1116; 1117; X86-SSE1-LABEL: merge_2i64_i64_12_volatile: 1118; X86-SSE1: # %bb.0: 1119; X86-SSE1-NEXT: pushl %edi 1120; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 1121; X86-SSE1-NEXT: pushl %esi 1122; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 1123; X86-SSE1-NEXT: .cfi_offset %esi, -12 1124; X86-SSE1-NEXT: .cfi_offset %edi, -8 1125; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1126; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1127; X86-SSE1-NEXT: movl 8(%ecx), %edx 1128; X86-SSE1-NEXT: movl 12(%ecx), %esi 1129; X86-SSE1-NEXT: movl 16(%ecx), %edi 1130; X86-SSE1-NEXT: movl 20(%ecx), %ecx 1131; X86-SSE1-NEXT: movl %ecx, 12(%eax) 1132; X86-SSE1-NEXT: movl %edi, 8(%eax) 1133; X86-SSE1-NEXT: movl %esi, 4(%eax) 1134; X86-SSE1-NEXT: movl %edx, (%eax) 1135; X86-SSE1-NEXT: popl %esi 1136; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 1137; X86-SSE1-NEXT: popl %edi 1138; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 1139; X86-SSE1-NEXT: retl $4 1140; 1141; X86-SSE41-LABEL: merge_2i64_i64_12_volatile: 1142; X86-SSE41: # %bb.0: 1143; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1144; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1145; X86-SSE41-NEXT: pinsrd $1, 12(%eax), %xmm0 1146; X86-SSE41-NEXT: pinsrd $2, 16(%eax), %xmm0 1147; X86-SSE41-NEXT: pinsrd $3, 20(%eax), %xmm0 1148; X86-SSE41-NEXT: retl 1149 %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1 1150 %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2 1151 %val0 = load volatile i64, ptr %ptr0 1152 %val1 = load volatile i64, ptr %ptr1 1153 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0 1154 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1 1155 ret <2 x i64> %res1 1156} 1157 1158define <4 x float> @merge_4f32_f32_2345_volatile(ptr %ptr) nounwind uwtable noinline ssp { 1159; SSE2-LABEL: merge_4f32_f32_2345_volatile: 1160; SSE2: # %bb.0: 1161; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1162; SSE2-NEXT: movss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero 1163; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1164; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1165; SSE2-NEXT: retq 1166; 1167; SSE41-LABEL: merge_4f32_f32_2345_volatile: 1168; SSE41: # %bb.0: 1169; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1170; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1171; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1172; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1173; SSE41-NEXT: retq 1174; 1175; AVX-LABEL: merge_4f32_f32_2345_volatile: 1176; AVX: # %bb.0: 1177; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1178; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1179; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1180; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1181; AVX-NEXT: retq 1182; 1183; X86-SSE1-LABEL: merge_4f32_f32_2345_volatile: 1184; X86-SSE1: # %bb.0: 1185; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1186; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1187; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1188; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1189; X86-SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1190; X86-SSE1-NEXT: retl 1191; 1192; X86-SSE41-LABEL: merge_4f32_f32_2345_volatile: 1193; X86-SSE41: # %bb.0: 1194; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1195; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1196; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1197; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1198; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1199; X86-SSE41-NEXT: retl 1200 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 2 1201 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 3 1202 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 4 1203 %ptr3 = getelementptr inbounds float, ptr %ptr, i64 5 1204 
%val0 = load volatile float, ptr %ptr0 1205 %val1 = load float, ptr %ptr1 1206 %val2 = load float, ptr %ptr2 1207 %val3 = load float, ptr %ptr3 1208 %res0 = insertelement <4 x float> undef, float %val0, i32 0 1209 %res1 = insertelement <4 x float> %res0, float %val1, i32 1 1210 %res2 = insertelement <4 x float> %res1, float %val2, i32 2 1211 %res3 = insertelement <4 x float> %res2, float %val3, i32 3 1212 ret <4 x float> %res3 1213} 1214 1215; 1216; Non-consecutive test. 1217; 1218 1219define <4 x float> @merge_4f32_f32_X0YY(ptr %ptr0, ptr %ptr1) nounwind uwtable noinline ssp { 1220; SSE-LABEL: merge_4f32_f32_X0YY: 1221; SSE: # %bb.0: 1222; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1223; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1224; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] 1225; SSE-NEXT: retq 1226; 1227; AVX-LABEL: merge_4f32_f32_X0YY: 1228; AVX: # %bb.0: 1229; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1230; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1231; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] 1232; AVX-NEXT: retq 1233; 1234; X86-SSE-LABEL: merge_4f32_f32_X0YY: 1235; X86-SSE: # %bb.0: 1236; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1237; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1238; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1239; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1240; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] 1241; X86-SSE-NEXT: retl 1242 %val0 = load float, ptr %ptr0, align 4 1243 %val1 = load float, ptr %ptr1, align 4 1244 %res0 = insertelement <4 x float> undef, float %val0, i32 0 1245 %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1 1246 %res2 = insertelement <4 x float> %res1, float %val1, i32 2 1247 %res3 = insertelement <4 x float> %res2, float %val1, i32 3 1248 ret <4 x float> %res3 1249} 1250 1251; 1252; Extension tests. 
;

; PR31309
; Loads an i32, zero-extends it to i128 and bitcasts the result to <4 x i32>.
; The checks expect the x86-64 runs and the 32-bit SSE4.1 run to use a single
; movss/vmovss load whose upper three elements are zero. The 32-bit SSE1-only
; run copies the loaded word with integer moves and stores explicit zeros to
; offsets 4, 8 and 12 of the address in %eax, returning with `retl $4`.
define <4 x i32> @load_i32_zext_i128_v4i32(ptr %ptr) {
; SSE-LABEL: load_i32_zext_i128_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_zext_i128_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: load_i32_zext_i128_v4i32:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl (%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    movl $0, 4(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: load_i32_zext_i128_v4i32:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %1 = load i32, ptr %ptr
  %2 = zext i32 %1 to i128
  %3 = bitcast i128 %2 to <4 x i32>
  ret <4 x i32> %3
}