;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;
;;; Syntax: NASM (Intel operand order), 64-bit.
;;; ABI:    System V AMD64 when assembled as elf64, Microsoft x64 when
;;;         assembled as win64; the register maps below select between them.
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

;; ---------------------------------------------------------------------
;; elf64: all five arguments arrive in registers.  arg5 is not a real
;; argument of this function; it is only used as a scratch pointer.
;; ---------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13             ; callee-saved: pushed/popped below
 %define tmp4   r12             ; callee-saved: pushed/popped below
 %define tmp5   r14             ; callee-saved: pushed/popped below
 %define tmp6   r15             ; callee-saved: pushed/popped below
 %define tmp7   rbp             ; callee-saved: pushed/popped below
 %define tmp8   rbx             ; callee-saved: pushed/popped below

 %define func(x) x: endbranch

 ;; Preserve the six callee-saved GPRs used as temporaries.
 %macro FUNC_SAVE 0
        push    r12
        push    r13
        push    r14
        push    r15
        push    rbp
        push    rbx
 %endmacro

 ;; Pops mirror the pushes of FUNC_SAVE in reverse order.
 %macro FUNC_RESTORE 0
        pop     rbx
        pop     rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
 %endmacro
%endif

;; ---------------------------------------------------------------------
;; win64: four arguments arrive in registers, the fifth is read from the
;; caller's stack into arg4 once the prologue is complete.
;; ---------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12             ; saved, then loaded from the stack, restored
 %define arg5   r15             ; saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13             ; saved and restored
 %define tmp4   r14             ; saved and restored
 %define tmp5   rdi             ; saved and restored
 %define tmp6   rsi             ; saved and restored
 %define tmp7   rbp             ; saved and restored
 %define tmp8   rbx             ; saved and restored

 ;; Frame layout: 7 xmm slots (16 bytes each) at offset 0, followed by
 ;; 8-byte GPR slots starting at offset 7*16.
 %define stack_size  7*16 + 9*8         ; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + 8 + 8*x]

 %define func(x) proc_frame x

 ;; Allocate the frame, store xmm6-xmm12 and the eight GPRs used as
 ;; temporaries, then fetch the fifth argument.
 %macro FUNC_SAVE 0
        alloc_stack     stack_size
        vmovdqa         [rsp + 0*16], xmm6
        vmovdqa         [rsp + 1*16], xmm7
        vmovdqa         [rsp + 2*16], xmm8
        vmovdqa         [rsp + 3*16], xmm9
        vmovdqa         [rsp + 4*16], xmm10
        vmovdqa         [rsp + 5*16], xmm11
        vmovdqa         [rsp + 6*16], xmm12
        mov             [rsp + 7*16 + 0*8], r12
        mov             [rsp + 7*16 + 1*8], r13
        mov             [rsp + 7*16 + 2*8], r14
        mov             [rsp + 7*16 + 3*8], r15
        mov             [rsp + 7*16 + 4*8], rdi
        mov             [rsp + 7*16 + 5*8], rsi
        mov             [rsp + 7*16 + 6*8], rbp
        mov             [rsp + 7*16 + 7*8], rbx
        end_prolog
        mov             arg4, arg(4)
 %endmacro

 ;; Reload everything stored by FUNC_SAVE and release the frame.
 %macro FUNC_RESTORE 0
        vmovdqa         xmm6,  [rsp + 0*16]
        vmovdqa         xmm7,  [rsp + 1*16]
        vmovdqa         xmm8,  [rsp + 2*16]
        vmovdqa         xmm9,  [rsp + 3*16]
        vmovdqa         xmm10, [rsp + 4*16]
        vmovdqa         xmm11, [rsp + 5*16]
        vmovdqa         xmm12, [rsp + 6*16]
        mov             r12, [rsp + 7*16 + 0*8]
        mov             r13, [rsp + 7*16 + 1*8]
        mov             r14, [rsp + 7*16 + 2*8]
        mov             r15, [rsp + 7*16 + 3*8]
        mov             rdi, [rsp + 7*16 + 4*8]
        mov             rsi, [rsp + 7*16 + 5*8]
        mov             rbp, [rsp + 7*16 + 6*8]
        mov             rbx, [rsp + 7*16 + 7*8]
        add             rsp, stack_size
 %endmacro
%endif

;; ---------------------------------------------------------------------
;; Register roles
;; ---------------------------------------------------------------------
%define len        arg0         ; bytes left to process
%define vec        arg1         ; source count; multiplied by 8 at entry
%define mul_array  arg2         ; base of the multiplication tables
%define src        arg3         ; array of source buffer pointers
%define dest1      arg4         ; array of destination buffer pointers
%define ptr        arg5         ; scratch: current source / 1st destination
%define vec_i      tmp2         ; byte offset into the src pointer array
%define dest2      tmp3         ; 2nd destination buffer
%define dest3      tmp4         ; 3rd destination buffer
%define dest4      tmp5         ; 4th destination buffer
%define vskip3     tmp6         ; original vec * 3*8
%define dest5      tmp7         ; 5th destination buffer
%define vskip5     tmp8         ; original vec * 5*8
%define pos        rax          ; byte offset inside every buffer

;; ---------------------------------------------------------------------
;; Full-width load/store selection
;; ---------------------------------------------------------------------
%ifndef EC_ALIGNED_ADDR
;;; Unaligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Aligned addresses: plain aligned moves when NO_NT_LDST is defined,
;;; non-temporal load/store otherwise
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

;; One broadcast table register per destination
%define xgft1  zmm7
%define xgft2  zmm8
%define xgft3  zmm9
%define xgft4  zmm10
%define xgft5  zmm11
%define xgft6  zmm12

;; Source data and one accumulator per destination
%define x0     zmm0
%define xp1    zmm1
%define xp2    zmm2
%define xp3    zmm3
%define xp4    zmm4
%define xp5    zmm5
%define xp6    zmm6

default rel
[bits 64]

section .text

;;
;; ENCODE_64B_6 [opmask]
;;
;; Encodes 64 bytes of all "k" sources into 6x 64 bytes (parity disks),
;; at byte offset 'pos' of every buffer.
;;
;; Without an argument the block is loaded with XLDR and stored with XSTR.
;; With one argument (an opmask register) the source load and the six
;; stores are vmovdqu8 operations under that mask, which is how the
;; function handles a final block shorter than 64 bytes.
;;
;; Expects: pos, src, mul_array, vskip3, vskip5, dest2..dest5 set up, and
;;          vec already multiplied by 8.
;; Writes:  tmp, vec_i, ptr, x0, xp1-xp6, xgft1-xgft6, flags.
;; NOTE(review): GF_MUL_XOR comes from gf_vect_gfni.inc; any extra
;; registers it touches are not visible from this file.
;;
%macro ENCODE_64B_6 0-1
%define %%tail_mask %1

        ;; Clear the six accumulators
        vpxorq  xp1, xp1, xp1
        vpxorq  xp2, xp2, xp2
        vpxorq  xp3, xp3, xp3
        vpxorq  xp4, xp4, xp4
        vpxorq  xp5, xp5, xp5
        vpxorq  xp6, xp6, xp6
        xor     vec_i, vec_i                    ; start with the first source
        mov     tmp, mul_array                  ; tmp walks the table entries

%%each_source:
        mov     ptr, [src + vec_i]              ; pointer to the current source
%if %0 == 1
        vmovdqu8 x0{%%tail_mask}, [ptr + pos]   ; masked load (less than 64 bytes)
%else
        XLDR    x0, [ptr + pos]                 ; full 64-byte load
%endif

        ;; Broadcast this source's 8-byte table entry for each of the six
        ;; destinations.  Entries of consecutive destinations are 'vec'
        ;; bytes apart (vec is already scaled by 8 here).
        vbroadcastf32x2 xgft1, [tmp]
        vbroadcastf32x2 xgft2, [tmp + vec]
        vbroadcastf32x2 xgft3, [tmp + vec*2]
        vbroadcastf32x2 xgft4, [tmp + vskip3]
        vbroadcastf32x2 xgft5, [tmp + vec*4]
        vbroadcastf32x2 xgft6, [tmp + vskip5]
        add     vec_i, 8                        ; next source pointer
        add     tmp, 8                          ; next table entry

        GF_MUL_XOR EVEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
                        xgft4, xgft4, xp4, xgft5, xgft5, xp5, xgft6, xgft6, xp6

        cmp     vec_i, vec
        jl      %%each_source

        ;; The 1st and 6th destination pointers are fetched here, into
        ;; registers that are free again once the source loop is done.
        mov     ptr, [dest1]                    ; 1st destination
        mov     tmp, [dest1 + 5*8]              ; 6th destination

%if %0 == 1
        vmovdqu8 [dest2 + pos]{%%tail_mask}, xp2
        vmovdqu8 [dest3 + pos]{%%tail_mask}, xp3
        vmovdqu8 [dest4 + pos]{%%tail_mask}, xp4
        vmovdqu8 [dest5 + pos]{%%tail_mask}, xp5
        vmovdqu8 [ptr + pos]{%%tail_mask}, xp1  ; dest 1
        vmovdqu8 [tmp + pos]{%%tail_mask}, xp6  ; dest 6
%else
        XSTR    [dest2 + pos], xp2
        XSTR    [dest3 + pos], xp3
        XSTR    [dest4 + pos], xp4
        XSTR    [dest5 + pos], xp5
        XSTR    [ptr + pos], xp1                ; dest 1
        XSTR    [tmp + pos], xp6                ; dest 6
%endif
%endmacro

;;
;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests)
;;
;; In:  len       - number of bytes to process per buffer
;;      vec       - number of source buffers
;;      mul_array - multiplication tables (g_tbls)
;;      src       - array of 'vec' source pointers (buffs)
;;      dest1     - array of six destination pointers (dests)
;; Processes the buffers 64 bytes at a time and finishes a remainder of
;; 1..63 bytes with an opmask.  No value is placed in rax for the caller;
;; rax is used as the running offset 'pos'.
;;
align 16
mk_global gf_6vect_dot_prod_avx512_gfni, function
func(gf_6vect_dot_prod_avx512_gfni)
        FUNC_SAVE

        ;; Destinations 2..5 stay in registers for the whole call;
        ;; destinations 1 and 6 are reloaded inside ENCODE_64B_6.
        mov     dest2, [dest1 + 8]
        mov     dest3, [dest1 + 2*8]
        mov     dest4, [dest1 + 3*8]
        mov     dest5, [dest1 + 4*8]

        ;; Table strides for destinations 4 and 6, computed from the
        ;; unscaled source count.
        mov     vskip3, vec
        imul    vskip3, 3*8
        mov     vskip5, vec
        imul    vskip5, 5*8
        shl     vec, 3                  ; vec *= 8 so that vec_i can count by 8

        xor     pos, pos                ; start at offset 0

        cmp     len, 64
        jl      .tail                   ; less than one full block

.full_block:

        ENCODE_64B_6

        add     pos, 64                 ; advance by one 64-byte block
        sub     len, 64
        cmp     len, 64
        jge     .full_block

.tail:
        test    len, len
        jle     .done                   ; nothing left

        ;; k1 = (1 << len) - 1: one mask bit per remaining byte
        xor     tmp, tmp
        bts     tmp, len
        dec     tmp
        kmovq   k1, tmp

        ENCODE_64B_6 k1

.done:
        vzeroupper

        FUNC_RESTORE
        ret

endproc_frame
%endif  ; if AS_FEATURE_LEVEL >= 10