; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64

define void @big_nonzero_16_bytes(ptr nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_16_bytes:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i32, ptr %a, i64 1
  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 2
  %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 3

  store i32 1, ptr %a, align 4
  store i32 2, ptr %arrayidx1, align 4
  store i32 3, ptr %arrayidx2, align 4
  store i32 4, ptr %arrayidx3, align 4
  ret void
}

; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
; it takes extra instructions to do this in scalar.

define void @big_nonzero_16_bytes_big64bit_constants(ptr nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X64:       # %bb.0:
; X64-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    movabsq $12884901889, %rax # imm = 0x300000001
; X64-NEXT:    movq %rax, 8(%rdi)
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i64, ptr %a, i64 1

  store i64 4294967297, ptr %a
  store i64 12884901889, ptr %arrayidx1
  ret void
}

; Splats may be an opportunity to use a broadcast op.

define void @big_nonzero_32_bytes_splat(ptr nocapture %a) {
; X32-LABEL: big_nonzero_32_bytes_splat:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_32_bytes_splat:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i32, ptr %a, i64 1
  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 2
  %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 3
  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 4
  %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 5
  %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 6
  %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 7

  store i32 42, ptr %a, align 4
  store i32 42, ptr %arrayidx1, align 4
  store i32 42, ptr %arrayidx2, align 4
  store i32 42, ptr %arrayidx3, align 4
  store i32 42, ptr %arrayidx4, align 4
  store i32 42, ptr %arrayidx5, align 4
  store i32 42, ptr %arrayidx6, align 4
  store i32 42, ptr %arrayidx7, align 4
  ret void
}

; Verify that we choose the best-sized store(s) for each chunk.

define void @big_nonzero_63_bytes(ptr nocapture %a) {
; X32-LABEL: big_nonzero_63_bytes:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
; X32-NEXT:    vmovups %xmm0, 32(%eax)
; X32-NEXT:    movl $0, 52(%eax)
; X32-NEXT:    movl $7, 48(%eax)
; X32-NEXT:    movl $8, 56(%eax)
; X32-NEXT:    movw $9, 60(%eax)
; X32-NEXT:    movb $10, 62(%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_63_bytes:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,3,4]
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    movq $5, 32(%rdi)
; X64-NEXT:    movq $6, 40(%rdi)
; X64-NEXT:    movq $7, 48(%rdi)
; X64-NEXT:    movl $8, 56(%rdi)
; X64-NEXT:    movw $9, 60(%rdi)
; X64-NEXT:    movb $10, 62(%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arrayidx8 = getelementptr inbounds i64, ptr %a, i64 1
  %arrayidx16 = getelementptr inbounds i64, ptr %a, i64 2
  %arrayidx24 = getelementptr inbounds i64, ptr %a, i64 3
  %arrayidx32 = getelementptr inbounds i64, ptr %a, i64 4
  %arrayidx40 = getelementptr inbounds i64, ptr %a, i64 5
  %arrayidx48 = getelementptr inbounds i64, ptr %a, i64 6
  %arrayidx56 = getelementptr inbounds i32, ptr %a, i64 14
  %arrayidx60 = getelementptr inbounds i16, ptr %a, i64 30
  %arrayidx62 = getelementptr inbounds i8, ptr %a, i64 62

  store i64 1, ptr %a
  store i64 2, ptr %arrayidx8
  store i64 3, ptr %arrayidx16
  store i64 4, ptr %arrayidx24
  store i64 5, ptr %arrayidx32
  store i64 6, ptr %arrayidx40
  store i64 7, ptr %arrayidx48
  store i32 8, ptr %arrayidx56
  store i16 9, ptr %arrayidx60
  store i8 10, ptr %arrayidx62
  ret void
}
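
; A possible additional case, sketched here for illustration only: four adjacent
; nonzero i16 constant stores, which the same constant-store merging should be able
; to combine into a single 64-bit store. The function name and body are hypothetical
; additions, not part of the original test, and they are not covered by the
; autogenerated assertions above; CHECK lines would need to be regenerated with
; utils/update_llc_test_checks.py before relying on this case.

define void @sketch_nonzero_8_bytes_i16(ptr nocapture %a) {
  ; Hypothetical sketch: 8 total bytes of small nonzero constants stored contiguously.
  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 1
  %arrayidx2 = getelementptr inbounds i16, ptr %a, i64 2
  %arrayidx3 = getelementptr inbounds i16, ptr %a, i64 3

  store i16 1, ptr %a, align 2
  store i16 2, ptr %arrayidx1, align 2
  store i16 3, ptr %arrayidx2, align 2
  store i16 4, ptr %arrayidx3, align 2
  ret void
}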