;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized pq of N source vectors using AVX512
;;; int pq_gen_avx512(int vects, int len, void **array)

;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
;;; (**array).  Last two pointers are the P and Q destinations respectively.
;;; Vectors must be aligned to 64 bytes if NO_NT_LDST is not defined.
;;; Length must be 32 byte multiple.

%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

;;; ---- Argument registers: System V AMD64 ----
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define arg0d  edi		; 32-bit view of arg0 (int vects)
 %define arg1d  esi		; 32-bit view of arg1 (int len)
 %define tmp    r11
 %define tmp3   arg4
 %define return rax
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; ---- Argument registers: Microsoft x64 ----
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg0d  ecx		; 32-bit view of arg0 (int vects)
 %define arg1d  edx		; 32-bit view of arg1 (int len)
 %define tmp    r11
 %define tmp3   r10
 %define return rax
 %define stack_size  4*16 + 8	; must be an odd multiple of 8
 %define func(x) proc_frame x

 ;; Spill xmm6-xmm9: zmm6-zmm9 are used below and their low 128 bits
 ;; are callee-saved on Win64.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqu	[rsp + 0*16], xmm6
	vmovdqu	[rsp + 1*16], xmm7
	vmovdqu	[rsp + 2*16], xmm8
	vmovdqu	[rsp + 3*16], xmm9
	end_prolog
 %endmacro

 ;; Reload xmm6-xmm9 and release the spill area.
 %macro FUNC_RESTORE 0
	vmovdqu	xmm6, [rsp + 0*16]
	vmovdqu	xmm7, [rsp + 1*16]
	vmovdqu	xmm8, [rsp + 2*16]
	vmovdqu	xmm9, [rsp + 3*16]
	add	rsp, stack_size
 %endmacro
%endif

;;; ---- Scalar register roles ----
%define vec arg0		; vects-3 = index of the last source pointer
%define	len arg1		; byte length (biased by -128 inside loop128)
%define ptr arg3		; current source pointer / P destination
%define pos rax			; byte offset into every vector

;;; ---- Vector register roles: first 64-byte lane ----
%define xp1   zmm0
%define xq1   zmm1
%define xtmp1 zmm2
%define xs1   zmm3

;;; ---- Vector register roles: second 64-byte lane ----
%define xp2   zmm4
%define xq2   zmm5
%define xtmp2 zmm6
%define xs2   zmm7

%define xzero zmm8		; all zero
%define xpoly zmm9		; 0x1d in every byte

;;; ---- 256-bit views used by the 32-byte tail loop ----
%define xp1y   ymm0
%define xq1y   ymm1
%define xtmp1y ymm2
%define xs1y   ymm3
%define xzeroy ymm8
%define xpolyy ymm9

%define NO_NT_LDST
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
 %define XLDR vmovdqu8		;u8
 %define XSTR vmovdqu8
%else
 %define XLDR vmovntdqa
 %define XSTR vmovntdq
%endif

default rel

[bits 64]
section .text

;;; ---------------------------------------------------------------------
;;; int pq_gen_avx512(int vects, int len, void **array)
;;;
;;; In:    arg0 = vects  total pointer count (sources + P + Q)
;;;        arg1 = len    bytes per vector, must be a multiple of 32
;;;        arg2 = array  array[0..vects-3] sources,
;;;                      array[vects-2] = P, array[vects-1] = Q
;;; Out:   rax = 0 on success (also when len == 0)
;;;        rax = 1 if vects < 4, len < 0 or len is not a multiple of 32
;;; Clobb: arg0, arg1, arg3, r11, rax, zmm0-zmm9, k1, k2, flags
;;;        (on win64 the low 128 bits of xmm6-xmm9 are saved/restored)
;;;
;;; Per byte, walking the sources from last to first:
;;;     P ^= s
;;;     Q  = ((Q ^ s) << 1) ^ (msb(Q ^ s) ? 0x1d : 0)
;;; with the final source (index 0) xor'ed into Q without the shift.
;;; ---------------------------------------------------------------------
align 16
mk_global  pq_gen_avx512, function
func(pq_gen_avx512)
	FUNC_SAVE

	;; vects and len are 32-bit ints; neither ABI defines bits 63:32 of
	;; their registers.  Sign-extend before using them as 64-bit values.
	movsxd	vec, arg0d
	movsxd	len, arg1d

	sub	vec, 3			;Keep as offset to last source
	jng	return_fail		;Must have at least 2 sources
	test	len, len
	jz	return_pass		;Nothing to do for len == 0
	js	return_fail		;Negative length is invalid
	test	len, (32-1)		;Check alignment of length
	jnz	return_fail
	xor	eax, eax		;pos = 0 (32-bit write clears all of rax)
	mov	tmp, 0x1d
	vpbroadcastb xpoly, tmp		;xpoly = 0x1d in every byte
	vpxorq	xzero, xzero, xzero
	cmp	len, 128
	jl	loop32			;Short input: 32-byte loop only

	sub	len, 2*64		;Len points to last block

	;; ------------------------------
	;; Main loop: 128 bytes (two zmm lanes) per iteration
loop128:
	mov	ptr, [arg2+vec*8]	;Fetch last source pointer
	mov	tmp, vec		;Set tmp to point back to last vector
	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
	XLDR	xs2, [ptr+pos+64]	;Preload last vector (source)
	vpxorq	xp1, xp1, xp1		;p1 = 0
	vpxorq	xp2, xp2, xp2		;p2 = 0
	vpxorq	xq1, xq1, xq1		;q1 = 0
	vpxorq	xq2, xq2, xq2		;q2 = 0

next_vect:
	sub	tmp, 1			;Inner loop for each source vector
					; (flags from this sub feed the jg below;
					;  nothing in between writes flags)
	mov	ptr, [arg2+tmp*8]	; get pointer to next vect
	vpxorq	xq1, xq1, xs1		; q1 ^= s1
	vpxorq	xq2, xq2, xs2		; q2 ^= s2
	vpxorq	xp1, xp1, xs1		; p1 ^= s1
	vpxorq	xp2, xp2, xs2		; p2 ^= s2
	vpcmpb	k1, xq1, xzero, 1	; k1 = bytes of q1 < 0 (signed), i.e. msb set
	vpcmpb	k2, xq2, xzero, 1	; k2 = bytes of q2 < 0 (signed), i.e. msb set
	vpblendmb xtmp1 {k1}, xzero, xpoly ; xtmp1 = msb ? 0x1d : 0
	vpblendmb xtmp2 {k2}, xzero, xpoly ; xtmp2 = msb ? 0x1d : 0
	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
	XLDR	xs2, [ptr+pos+64]	; Get next vector (source data2)
	vpaddb	xq1, xq1, xq1		; q1 = q1<<1
	vpaddb	xq2, xq2, xq2		; q2 = q2<<1
	vpxorq	xq1, xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
	vpxorq	xq2, xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
	jg	next_vect		; Loop for each vect except 0

	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
	vpxorq	xp1, xp1, xs1		;p1 ^= s1[0] - last source is already loaded
	vpxorq	xq1, xq1, xs1		;q1 ^= 1 * s1[0]
	vpxorq	xp2, xp2, xs2		;p2 ^= s2[0]
	vpxorq	xq2, xq2, xs2		;q2 ^= 1 * s2[0]
	XSTR	[ptr+pos], xp1		;Write parity P1 vector
	XSTR	[ptr+pos+64], xp2	;Write parity P2 vector
	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
	XSTR	[tmp+pos+64], xq2	;Write parity Q2 vector
	add	pos, 2*64
	cmp	pos, len
	jle	loop128			;While another full 128-byte block fits

	;; ------------------------------
	;; Do last 32, 64 or 96 bytes remaining
	add	len, 2*64		;Undo the bias: len = true length again
	cmp	pos, len
	je	return_pass

	;; ------------------------------
	;; Tail loop: 32 bytes (one ymm) per iteration
loop32:
	mov	ptr, [arg2+vec*8]	;Fetch last source pointer
	mov	tmp, vec		;Set tmp to point back to last vector
	XLDR	xs1y, [ptr+pos]		;Preload last vector (source)
	vpxorq	xp1y, xp1y, xp1y	;p = 0
	vpxorq	xq1y, xq1y, xq1y	;q = 0

next_vect32:
	sub	tmp, 1			;Inner loop for each source vector
					; (flags from this sub feed the jg below)
	mov	ptr, [arg2+tmp*8]	; get pointer to next vect
	vpxorq	xq1y, xq1y, xs1y	; q1 ^= s1
	vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = poly or 0x00
	vpxorq	xp1y, xp1y, xs1y	; p ^= s
	vpaddb	xq1y, xq1y, xq1y	; q = q<<1
	vpxorq	xq1y, xq1y, xtmp1y	; q = q<<1 ^ poly_masked
	XLDR	xs1y, [ptr+pos]		; Get next vector (source data)
	jg	next_vect32		; Loop for each vect except 0

	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
	vpxorq	xp1y, xp1y, xs1y	;p ^= s[0] - last source is already loaded
	vpxorq	xq1y, xq1y, xs1y	;q ^= 1 * s[0]
	XSTR	[ptr+pos], xp1y		;Write parity P vector
	XSTR	[tmp+pos], xq1y		;Write parity Q vector
	add	pos, 32
	cmp	pos, len
	jl	loop32


return_pass:
	xor	eax, eax		;return 0
	FUNC_RESTORE
	ret

return_fail:
	mov	eax, 1			;return 1
	FUNC_RESTORE
	ret

endproc_frame

%endif  ; ifdef HAVE_AS_KNOWS_AVX512