;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_2vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;
;;; Syntax: NASM.  ABI: register arguments are mapped below for the elf64
;;; and win64 output formats.
;;;
;;; In:
;;;   len    - number of bytes to process in every buffer; a value <= 0
;;;            makes the function return without touching memory
;;;   vec    - number of source buffers; the source loop tests its count
;;;            after the first pass, so at least one source is always read
;;;   g_tbls - table of 8-byte entries: entry i is used for source i and
;;;            the first destination, entry (vec + i) for the second one
;;;   buffs  - array of 'vec' source pointers
;;;   dests  - array of two destination pointers
;;;
;;; Clobbers (besides the argument registers): rax, r10, r11, zmm0-zmm4,
;;; k1 and flags.  The callee-saved registers used (r12 on elf64,
;;; r12-r14 on win64) are saved and restored by FUNC_SAVE / FUNC_RESTORE.
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12             ; must be saved and restored

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
        push    r12
 %endmacro
 %macro FUNC_RESTORE 0
        pop     r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12             ; must be saved, loaded and restored
 %define arg5   r14             ; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13             ; must be saved and restored
 %define stack_size  3*8        ; must be an odd multiple of 8
 ;; Stack-passed argument x, addressed after the local frame is allocated
 ;; (skips the frame itself plus the return address).
 %define arg(x)      [rsp + stack_size + 8 + 8*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        alloc_stack     stack_size
        mov     [rsp + 0*8], r12
        mov     [rsp + 1*8], r13
        mov     [rsp + 2*8], r14
        end_prolog
        mov     arg4, arg(4)    ; fifth argument (dests) lives on the stack
 %endmacro

 %macro FUNC_RESTORE 0
        mov     r12, [rsp + 0*8]
        mov     r13, [rsp + 1*8]
        mov     r14, [rsp + 2*8]
        add     rsp, stack_size
 %endmacro
%endif

;; Register roles
%define len       arg0          ; bytes still to be processed
%define vec       arg1          ; source count, scaled to bytes (*8) on entry
%define mul_array arg2          ; g_tbls
%define src       arg3          ; buffs
%define dest1     arg4          ; dests on entry, then dests[0]
%define ptr       arg5          ; current source buffer pointer
%define vec_i     tmp2          ; byte offset into the src pointer array
%define dest2     tmp3          ; dests[1]
%define pos       rax           ; byte offset into every buffer

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Use Non-temporal load/store, or plain aligned load/store when
;;; NO_NT_LDST is defined
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%define xgft1   zmm3            ; broadcast table entry for destination 1
%define xgft2   zmm4            ; broadcast table entry for destination 2

%define x0      zmm0            ; current source data
%define xp1     zmm1            ; accumulator for destination 1
%define xp2     zmm2            ; accumulator for destination 2

default rel
[bits 64]

section .text

;;
;; Encodes 64 bytes of all "k" sources into 2x 64 bytes (parity disks)
;;
;; Optional parameter: an opmask register.  When it is given, the source
;; load and both destination stores are masked with it, which is how a
;; final chunk of fewer than 64 bytes is handled.
;;
;; Uses tmp, vec_i, ptr, x0, xp1, xp2, xgft1, xgft2 and flags.
;;
%macro ENCODE_64B_2 0-1
%define %%KMASK %1

        vpxorq  xp1, xp1, xp1           ; clear both accumulators
        vpxorq  xp2, xp2, xp2
        mov     tmp, mul_array          ; tmp walks the table entries
        xor     vec_i, vec_i

%%next_vect:
        mov     ptr, [src + vec_i]
%if %0 == 1
        vmovdqu8 x0{%%KMASK}, [ptr + pos]       ;Get next source vector (less than 64 bytes)
%else
        XLDR    x0, [ptr + pos]                 ;Get next source vector (64 bytes)
%endif
        add     vec_i, 8

        vbroadcastf32x2 xgft1, [tmp]            ; entry for destination 1
        vbroadcastf32x2 xgft2, [tmp + vec]      ; entry for destination 2 ('vec' bytes further)
        add     tmp, 8

        ;; GF_MUL_XOR comes from gf_vect_gfni.inc; presumably it multiplies
        ;; x0 by each table entry and XORs the products into xp1 / xp2.
        GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2

        cmp     vec_i, vec
        jl      %%next_vect

%if %0 == 1
        vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
        vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
%else
        XSTR    [dest1 + pos], xp1
        XSTR    [dest2 + pos], xp2
%endif
%endmacro

align 16
mk_global gf_2vect_dot_prod_avx512_gfni, function
func(gf_2vect_dot_prod_avx512_gfni)
        FUNC_SAVE

        xor     pos, pos
        shl     vec, 3                  ;vec *= 8. Make vec_i count by 8
        mov     dest2, [dest1 + 8]      ; dest2 = dests[1]
        mov     dest1, [dest1]          ; dest1 = dests[0]

        ;; Signed test, matching the signed loop and tail tests below.
        ;; An unsigned test here would send a negative len into .loop64
        ;; and encode one full 64-byte chunk before the signed tests
        ;; stopped it.
        cmp     len, 64
        jl      .len_lt_64

.loop64:

        ENCODE_64B_2

        add     pos, 64                 ;Loop on 64 bytes at a time
        sub     len, 64
        cmp     len, 64
        jge     .loop64

.len_lt_64:
        cmp     len, 0
        jle     .exit                   ; nothing (or a negative length) left

        ;; 0 < len < 64 here: k1 = (1 << len) - 1 selects the low len bytes
        xor     tmp, tmp
        bts     tmp, len
        dec     tmp
        kmovq   k1, tmp

        ENCODE_64B_2 k1

.exit:
        vzeroupper

        FUNC_RESTORE
        ret

endproc_frame
%endif  ; if AS_FEATURE_LEVEL >= 10