; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X86,X86-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64-SSE,X64-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64-SSE,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2

;
; PR42123
;

define void @merge_2_v4f32_align32(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movntps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movntps %xmm0, (%rsi)
; X64-SSE2-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movntps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT: movntdq %xmm0, (%rsi)
; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX1-LABEL: merge_2_v4f32_align32:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi)
; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: merge_2_v4f32_align32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16, !nontemporal !0
  ret void
}

; Don't merge nt and non-nt loads even if aligned.
define void @merge_2_v4f32_align32_mix_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; Don't merge nt and non-nt stores even if aligned.
define void @merge_2_v4f32_align32_mix_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps (%rdi), %xmm0
; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE-NEXT: movntps %xmm0, (%rsi)
; X64-SSE-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
define void @merge_2_v4f32_align16_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: vmovntdqa 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT: vmovdqa %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 16, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 16
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
define void @merge_2_v4f32_align16_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movntps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps (%rdi), %xmm0
; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE-NEXT: movntps %xmm0, (%rsi)
; X64-SSE-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 16
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 16, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
define void @merge_2_v4f32_align1_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movups (%ecx), %xmm0
; X86-NEXT: movups 16(%ecx), %xmm1
; X86-NEXT: movups %xmm0, (%eax)
; X86-NEXT: movups %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movups (%rdi), %xmm0
; X64-SSE-NEXT: movups 16(%rdi), %xmm1
; X64-SSE-NEXT: movups %xmm0, (%rsi)
; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 1, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1
  store <4 x float> %3, ptr %4, align 1
  ret void
}

; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT: movq %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, (%rsi)
; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT: movq %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1
  %3 = load <4 x float>, ptr %1, align 1
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1, !nontemporal !0
  store <4 x float> %3, ptr %4, align 1, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1:
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT: movq %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, (%rsi)
; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT: movq %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 1, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1, !nontemporal !0
  store <4 x float> %3, ptr %4, align 1, !nontemporal !0
  ret void
}

!0 = !{i32 1}