1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2023 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_2vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

;;; ------------------------------------------------------------------
;;; System V AMD64 (ELF): all six arguments arrive in registers and only
;;; volatile registers are used, so no prologue/epilogue work is needed.
;;; ------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0		rdi
 %define arg1		rsi
 %define arg2		rdx
 %define arg3		rcx
 %define arg4		r8
 %define arg5		r9
 %define tmp		r11
 %define tmp2		r10
 %define func(x)	x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; ------------------------------------------------------------------
;;; Microsoft x64: the 5th and 6th arguments are fetched from the stack
;;; into r12/r13, which therefore have to be preserved together with
;;; xmm6 (written through its zmm6 alias, xret2).
;;; ------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0		rcx
 %define arg1		rdx
 %define arg2		r8
 %define arg3		r9
 %define arg4		r12		; saved in FUNC_SAVE, then loaded from the stack
 %define arg5		r13		; saved in FUNC_SAVE, then loaded from the stack
 %define tmp		r11
 %define tmp2		r10
 ; 16 bytes for xmm6 + 3 qword slots (two of them hold r12/r13).
 ; Must be an odd multiple of 8 so that rsp is 16-byte aligned after the
 ; "sub rsp" (required by the vmovdqa spill of xmm6).
 %define stack_size	16 + 3*8
 ; x-th argument slot as seen after FUNC_SAVE has lowered rsp:
 ; skip the local frame and the return address.
 %define arg(x)		[rsp + stack_size + 8 + 8*x]

 %define func(x)	proc_frame x

 ; Prologue: allocate the frame, spill xmm6/r12/r13, then pull the two
 ; stack-passed arguments into their registers.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp + 16*0], xmm6
	mov	[rsp + 16 + 0*8], r12
	mov	[rsp + 16 + 1*8], r13
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 ; Epilogue: reload everything FUNC_SAVE spilled and release the frame.
 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 16*0]
	mov	r12,  [rsp + 16 + 0*8]
	mov	r13,  [rsp + 16 + 1*8]
	add	rsp, stack_size
 %endmacro
%endif

;;; Symbolic names for the arguments and the scratch registers
%define len		arg0
%define vec		arg1
%define vec_i		arg2
%define mul_array	arg3
%define src		arg4
%define dest1		arg5
%define pos		rax
%define dest2		tmp2

;;; Select the instructions used for full-width (unmasked) loads/stores
%ifdef EC_ALIGNED_ADDR
 %ifdef NO_NT_LDST
;;; Aligned addresses, regular (temporal) load/store
  %define XLDR	vmovdqa64
  %define XSTR	vmovdqa64
 %else
;;; Aligned addresses, non-temporal load/store
  %define XLDR	vmovntdqa
  %define XSTR	vmovntdq
 %endif
%else
;;; Unaligned load/store
 %define XLDR	vmovdqu8
 %define XSTR	vmovdqu8
%endif

default rel
[bits 64]
section .text

;;; Vector register roles
%define x0	zmm0		; source data
%define xd1	zmm1		; destination 1 data
%define xd2	zmm2		; destination 2 data
%define xgft1	zmm3		; broadcast table entry for destination 1
%define xgft2	zmm4		; broadcast table entry for destination 2
%define xret1	zmm5		; scratch handed to GF_MUL_XOR
%define xret2	zmm6		; scratch handed to GF_MUL_XOR

;;
;; Encodes 64 bytes of a single source into 2x 64 bytes (parity disks)
;;
%macro ENCODE_64B_2 0-1
;; ENCODE_64B_2 [kmask]
;;   Loads one 64-byte chunk at offset `pos` from src, dest1 and dest2, runs
;;   GF_MUL_XOR on them with the xgft1/xgft2 tables, and stores xd1/xd2 back
;;   to dest1/dest2.
;;   (GF_MUL_XOR comes from gf_vect_gfni.inc; presumably it computes
;;   xdN ^= gf_mul(x0, xgftN) -- confirm against the include file.)
;;
;;   Without an argument the full-width XLDR/XSTR instructions are used.
;;   With an opmask register argument every load and store is byte-masked,
;;   so only the bytes selected by the mask are accessed in memory.
%define %%KMASK %1

%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [src + pos]	;Get next source vector (masked)
	vmovdqu8 xd1{%%KMASK}, [dest1 + pos]	;Get next dest1 vector (masked)
	vmovdqu8 xd2{%%KMASK}, [dest2 + pos]	;Get next dest2 vector (masked)
%else
	XLDR	x0, [src + pos]		;Get next source vector
	XLDR	xd1, [dest1 + pos]	;Get next dest1 vector
	XLDR	xd2, [dest2 + pos]	;Get next dest2 vector
%endif

	GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2

%if %0 == 1
	vmovdqu8 [dest1 + pos]{%%KMASK}, xd1
	vmovdqu8 [dest2 + pos]{%%KMASK}, xd2
%else
	XSTR	[dest1 + pos], xd1
	XSTR	[dest2 + pos], xd2
%endif
%endmacro

;;-----------------------------------------------------------------------
;; gf_2vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest)
;;
;; In:    len       = number of bytes to process (handled as a signed
;;                    64-bit value; len <= 0 does nothing)
;;        vec       = stride, in 8-byte entries, between the table entry
;;                    used for dest[0] and the one used for dest[1]
;;        vec_i     = index, in 8-byte entries, of the entry for dest[0]
;;        mul_array = base address of the 8-byte table entries
;;        src       = source buffer
;;        dest      = address of two consecutive pointers: dest[0], dest[1]
;; Out:   dest[0][0..len-1] and dest[1][0..len-1] updated in place
;; Clobb: rax, r10, r11, the vec/vec_i/dest argument registers,
;;        zmm0-zmm6, k1, flags (win64: r12, r13 and xmm6 are preserved
;;        by FUNC_SAVE/FUNC_RESTORE)
;;-----------------------------------------------------------------------
align 16
mk_global gf_2vect_mad_avx512_gfni, function
func(gf_2vect_mad_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	shl	vec_i, 3		;Table entries are 8 bytes each
	shl	vec, 3
	lea	tmp, [mul_array + vec_i]
	vbroadcastf32x2 xgft1, [tmp]		;8-byte entry for dest1, repeated across zmm
	vbroadcastf32x2 xgft2, [tmp + vec]	;8-byte entry for dest2, repeated across zmm
	mov	dest2, [dest1 + 8]	;dest2 = dest[1] (kept in tmp2)
	mov	dest1, [dest1]		;dest1 = dest[0] (overwrites the dest argument)

	;; Fewer than 64 bytes in total: skip the full-width loop entirely.
	;; Entering it unconditionally would load and store 64 unmasked bytes,
	;; overrunning src/dest buffers shorter than 64 bytes.
	cmp	len, 64
	jl	.len_lt_64

.loop64:
	;; Invariant: len >= 64 bytes remain, starting at offset pos
	ENCODE_64B_2

	add	pos, 64			;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	;; 0 < len < 64 here: build k1 = (1 << len) - 1, one mask bit per tail byte
	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_2 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif  ; if AS_FEATURE_LEVEL >= 10