1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2023 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_3vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

;;
;; ELF64 output: all six arguments arrive in registers, and the routine
;; only touches registers that need no saving, so FUNC_SAVE and
;; FUNC_RESTORE expand to nothing.
;;
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0		rdi
 %define arg1		rsi
 %define arg2		rdx
 %define arg3		rcx
 %define arg4		r8
 %define arg5		r9
 %define tmp		r11
 %define func(x)	x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;
;; WIN64 output: only four arguments arrive in registers. Arguments 4 and 5
;; are read from the caller's stack into r12/r13, which (together with
;; xmm6-xmm9) are saved in a local frame by FUNC_SAVE and put back by
;; FUNC_RESTORE.
;;
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0		rcx
 %define arg1		rdx
 %define arg2		r8
 %define arg3		r9
 ;; arg4: must be saved, loaded and restored
 %define arg4		r12
 ;; arg5: must be saved and restored
 %define arg5		r13
 %define tmp		r11
 ;; Four 16-byte XMM slots followed by three 8-byte slots. The macros only
 ;; use two of the 8-byte slots; the third presumably pads the frame so the
 ;; vmovdqa slots stay 16-byte aligned (TODO confirm).
 %define stack_size	16*4 + 3*8
 ;; x-th argument as seen after FUNC_SAVE has lowered rsp
 ;; (the extra 8 skips the return address).
 %define arg(x)		[rsp + stack_size + 8 + 8*x]
 %define func(x)	proc_frame x

 ;; Prologue: allocate the frame, spill xmm6-xmm9 and r12/r13, then fetch
 ;; the two stack-passed arguments.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp + 16*0], xmm6
	vmovdqa	[rsp + 16*1], xmm7
	vmovdqa	[rsp + 16*2], xmm8
	vmovdqa	[rsp + 16*3], xmm9
	mov	[rsp + 4*16 + 0*8], r12
	mov	[rsp + 4*16 + 1*8], r13
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 ;; Epilogue: reload everything FUNC_SAVE spilled and release the frame.
 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 16*0]
	vmovdqa	xmm7, [rsp + 16*1]
	vmovdqa	xmm8, [rsp + 16*2]
	vmovdqa	xmm9, [rsp + 16*3]
	mov	r12, [rsp + 4*16 + 0*8]
	mov	r13, [rsp + 4*16 + 1*8]
	add	rsp, stack_size
 %endmacro
%endif

;; Symbolic names for the arguments and working registers.
%define len		arg0
%define vec		arg1
%define vec_i		arg2
%define mul_array	arg3
%define src		arg4
%define dest1		arg5
%define pos		rax
;; dest2/dest3 alias argument registers; they are only valid once the
;; original argument values are no longer needed.
%define dest2		mul_array
%define dest3		vec_i

;; Select the full-width load/store instructions.
%ifdef EC_ALIGNED_ADDR
 %ifdef NO_NT_LDST
;;; Use aligned load/store
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
;;; Use Non-temporal load/store
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%else
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%endif

default rel
[bits 64]
section .text

%define x0		zmm0
%define xgft1	zmm1
%define xgft2	zmm2
%define xgft3	zmm3
%define xd1	zmm4
%define xd2	zmm5
%define xd3	zmm6

%define xret1	zmm7
%define xret2	zmm8
%define xret3	zmm9

;;
;; Encodes 64 bytes of a single source into 3x 64 bytes (parity disks)
;;
;; With no argument, full 64-byte vectors are loaded and stored with
;; XLDR/XSTR. With one argument (an opmask register), only the byte lanes
;; selected by the mask are loaded from src/dest1..3 and written back,
;; which is how the sub-64-byte tail is handled.
;;
;; Reads:  src, dest1, dest2, dest3, pos, xgft1-xgft3
;; Writes: x0, xd1-xd3, memory at dest1..3 + pos; xret1-xret3 are handed
;;         to GF_MUL_XOR (defined in gf_vect_gfni.inc), presumably as
;;         scratch
;;
%macro ENCODE_64B_3 0-1
%define %%KMASK %1

%if %0 == 1
	vmovdqu8	x0{%%KMASK}, [src + pos]	;Get next source vector
	vmovdqu8	xd1{%%KMASK}, [dest1 + pos]	;Get next dest vector
	vmovdqu8	xd2{%%KMASK}, [dest2 + pos]	;Get next dest vector
	vmovdqu8	xd3{%%KMASK}, [dest3 + pos]	;Get next dest vector
%else
	XLDR	x0, [src + pos]		;Get next source vector
	XLDR	xd1, [dest1 + pos]	;Get next dest vector
	XLDR	xd2, [dest2 + pos]	;Get next dest vector
	XLDR	xd3, [dest3 + pos]	;Get next dest vector
%endif

	GF_MUL_XOR EVEX, x0, xgft1, xret1, xd1, xgft2, xret2, xd2, xgft3, xret3, xd3

%if %0 == 1
	vmovdqu8	[dest1 + pos]{%%KMASK}, xd1
	vmovdqu8	[dest2 + pos]{%%KMASK}, xd2
	vmovdqu8	[dest3 + pos]{%%KMASK}, xd3
%else
	XSTR	[dest1 + pos], xd1
	XSTR	[dest2 + pos], xd2
	XSTR	[dest3 + pos], xd3
%endif
%endmacro

;;
;; gf_3vect_mad_avx512_gfni(len, vec, vec_i, mul_array, src, dest);
;;
;; In:   len       - number of bytes to process from src and each dest
;;       vec       - row stride of mul_array, in 8-byte entries
;;       vec_i     - index of the entry to use within each row
;;       mul_array - table of 8-byte entries; entries vec_i, vec + vec_i
;;                   and 2*vec + vec_i are broadcast into xgft1..xgft3
;;       src       - source buffer
;;       dest      - array of three destination buffer pointers
;; Clobbers: rax (pos), tmp, k1, zmm0-zmm9, flags, and the argument
;;           registers reused as dest2/dest3.
;;
;; Full 64-byte blocks are handled in .loop64; a remaining 1..63 bytes are
;; handled with a byte mask. Inputs shorter than 64 bytes must skip the
;; loop entirely, otherwise an unmasked 64-byte load/store would run past
;; the end of the buffers.
;;
align 16
mk_global gf_3vect_mad_avx512_gfni, function
func(gf_3vect_mad_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	shl	vec_i, 3		;Multiply by 8
	shl	vec, 3			;Multiply by 8
	lea	tmp, [mul_array + vec_i]
	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	vbroadcastf32x2 xgft3, [tmp + vec*2]
	;; mul_array and vec_i are dead from here on, so their registers are
	;; recycled. dest1 is overwritten last because it is the base pointer
	;; of the destination array.
	mov	dest2, [dest1 + 8]	; reuse mul_array
	mov	dest3, [dest1 + 2*8]	; reuse vec_i
	mov	dest1, [dest1]

	cmp	len, 64			;Fewer than 64 bytes in total:
	jl	.len_lt_64		;go straight to the masked tail

.loop64:
	ENCODE_64B_3

	add	pos, 64			;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit			;Nothing (or a non-positive length) left

	;; k1 = (1 << len) - 1 : one mask bit per remaining byte (len is 1..63)
	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_3 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif  ; if AS_FEATURE_LEVEL >= 10