1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2015 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 29 30;;; Optimized xor of N source vectors using AVX 31;;; int xor_gen_avx(int vects, int len, void **array) 32 33;;; Generates xor parity vector from N (vects-1) sources in array of pointers 34;;; (**array). Last pointer is the dest. 
;;; Vectors must be aligned to 32 bytes.  Length can be any value.
;;;
;;; Returns 0 on success, 1 on failure (fewer than 2 sources, or len < 0).
;;;
;;; Processing order: trailing bytes are handled first, working backwards
;;; from the end of the buffers (1 byte at a time until len is a multiple
;;; of 8, then 8 bytes at a time until len is a multiple of 128).  The
;;; remaining 128-byte multiple is then processed forwards from offset 0
;;; with 4 x 32-byte AVX loads/stores per iteration.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0	rdi
 %define arg0.d	edi		;32-bit view of arg0
 %define arg1	rsi
 %define arg1.d	esi		;32-bit view of arg1
 %define arg2	rdx
 %define arg3	rcx
 %define arg4	r8
 %define arg5	r9
 %define tmp	r11
 %define tmp3	arg4
 %define func(x) x: endbranch
 %define return	rax
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
 %define arg0	rcx
 %define arg0.d	ecx		;32-bit view of arg0
 %define arg1	rdx
 %define arg1.d	edx		;32-bit view of arg1
 %define arg2	r8
 %define arg3	r9
 %define tmp	r11
 %define tmp3	r10
 %define func(x) proc_frame x
 %define return	rax
 %define stack_size  2*32 + 8	;must be an odd multiple of 8

 ;; ymm6 and ymm7 are used as scratch below, so they are spilled to the
 ;; stack in the prolog and reloaded before returning.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqu	[rsp + 0*32], ymm6
	vmovdqu	[rsp + 1*32], ymm7
	end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
	vmovdqu	ymm6, [rsp + 0*32]
	vmovdqu	ymm7, [rsp + 1*32]
	add	rsp, stack_size
 %endmacro

%endif	;output formats


;;; Register roles
;;;   vec   - (vects - 2): index of the last source pointer in array
;;;   len   - bytes still to process (tail loops) / last block offset (loop128)
;;;   arg2  - array: base of the pointer table
;;;   ptr   - pointer to the vector currently being read or written
;;;   tmp   - countdown index over the source pointers
;;;   tmp2  - scalar xor accumulator (tail loops) / last source ptr (loop128)
;;;   pos   - byte offset of the current 128-byte block (shares tmp3)
%define vec	arg0
%define vec.d	arg0.d
%define len	arg1
%define len.d	arg1.d
%define ptr	arg3
%define tmp2	rax
%define tmp2.b	al
%define pos	tmp3
%define PS	8		;Pointer size in bytes

;;; Use non-temporal stores unless NO_NT_LDST is defined
%ifdef NO_NT_LDST
 %define XLDR vmovdqa
 %define XSTR vmovdqa
%else
 %define XLDR vmovdqa
 %define XSTR vmovntdq
%endif


default rel
[bits 64]

section .text

align 16
mk_global  xor_gen_avx, function
func(xor_gen_avx)

	FUNC_SAVE
	;; vects and len are C ints.  Neither ABI defines the upper 32 bits
	;; of the argument registers, so widen before any 64-bit arithmetic.
	movsxd	vec, vec.d
	movsxd	len, len.d
	sub	vec, 2			;Keep as offset to last source
	jng	return_fail		;Must have at least 2 sources
	cmp	len, 0
	je	return_pass		;Nothing to do
	jl	return_fail		;Negative length is invalid
	test	len, (128-1)		;Check alignment of length
	jnz	len_not_aligned


	;; Here len is a non-zero multiple of 128
len_aligned_128bytes:
	sub	len, 128		;len = offset of the last 128-byte block
	mov	pos, 0

loop128:
	mov	tmp, vec		;Back to last source
	mov	tmp2, [arg2+vec*PS]	;Fetch last source pointer in array
	sub	tmp, 1			;Next vect
	XLDR	ymm0, [tmp2+pos]	;Start with the last source
	XLDR	ymm1, [tmp2+pos+32]	;Keep xor parity in ymm0-3
	XLDR	ymm2, [tmp2+pos+(2*32)]
	XLDR	ymm3, [tmp2+pos+(3*32)]

next_vect:
	mov	ptr, [arg2+tmp*PS]
	sub	tmp, 1			;Sets the flags tested by jge below
	XLDR	ymm4, [ptr+pos]		;Get next vector (source)
	XLDR	ymm5, [ptr+pos+32]
	XLDR	ymm6, [ptr+pos+(2*32)]
	XLDR	ymm7, [ptr+pos+(3*32)]
	vxorpd	ymm0, ymm0, ymm4	;Add to xor parity
	vxorpd	ymm1, ymm1, ymm5
	vxorpd	ymm2, ymm2, ymm6
	vxorpd	ymm3, ymm3, ymm7
	jge	next_vect		;Loop for each source (AVX ops leave flags intact)

	mov	ptr, [arg2+PS+vec*PS]	;Address of parity vector
	XSTR	[ptr+pos], ymm0		;Write parity xor vector
	XSTR	[ptr+pos+(1*32)], ymm1
	XSTR	[ptr+pos+(2*32)], ymm2
	XSTR	[ptr+pos+(3*32)], ymm3
	add	pos, 128
	cmp	pos, len
	jle	loop128

return_pass:
	FUNC_RESTORE
	vzeroupper			;Leave upper ymm state clean for SSE code in the caller
	mov	return, 0
	ret


;;; Do one byte at a time for no alignment case.
;;; Works backwards from byte (len-1) until len is a multiple of PS.
loop_1byte:
	mov	tmp, vec		;Back to last source
	mov	ptr, [arg2+vec*PS]	;Fetch last source pointer in array
	mov	tmp2.b, [ptr+len-1]	;Get byte from last source
	sub	tmp, 1
nextvect_1byte:
	mov	ptr, [arg2+tmp*PS]
	xor	tmp2.b, [ptr+len-1]
	sub	tmp, 1
	jge	nextvect_1byte

	mov	tmp, vec
	add	tmp, 1			;Add back to point to last vec (dest)
	mov	ptr, [arg2+tmp*PS]
	mov	[ptr+len-1], tmp2.b	;Write parity
	sub	len, 1
	test	len, (PS-1)
	jnz	loop_1byte

	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;If not 0 and 128-byte aligned
	jz	len_aligned_128bytes	; then do aligned case. len = y * 128

	;; else we are 8-byte aligned so fall through to recheck


	;; Unaligned length cases
len_not_aligned:
	test	len, (PS-1)
	jne	loop_1byte
	mov	tmp3, len
	and	tmp3, (128-1)		;Do the unaligned bytes 8 at a time

	;; Run backwards 8 bytes at a time for (tmp3) bytes
loop8_bytes:
	mov	tmp, vec		;Back to last source
	mov	ptr, [arg2+vec*PS]	;Fetch last source pointer in array
	mov	tmp2, [ptr+len-PS]	;Get 8 bytes from last source
	sub	tmp, 1
nextvect_8bytes:
	mov	ptr, [arg2+tmp*PS]	;Get pointer to next vector
	xor	tmp2, [ptr+len-PS]
	sub	tmp, 1
	jge	nextvect_8bytes		;Loop for each source

	mov	tmp, vec
	add	tmp, 1			;Add back to point to last vec (dest)
	mov	ptr, [arg2+tmp*PS]
	mov	[ptr+len-PS], tmp2	;Write parity
	sub	len, PS
	sub	tmp3, PS
	jg	loop8_bytes

	cmp	len, 128		;Now len is aligned to 128B
	jge	len_aligned_128bytes	;We can do the rest aligned

	cmp	len, 0
	je	return_pass

return_fail:
	FUNC_RESTORE
	vzeroupper			;Harmless if no ymm register was touched
	mov	return, 1
	ret

endproc_frame

section .data

;;;       func        core, ver, snum
slversion xor_gen_avx, 02,  05,  0037