1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2015 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
;;;
;;; Syntax: NASM (Intel operand order). Targets elf64, win64 and elf32;
;;; the argument registers / stack slots are selected per output format below.
;;;
;;; For every 32-byte column of the inputs, the routine accumulates (XOR)
;;; the nibble-table products of `vec` source buffers and stores the result
;;; to dest:
;;;
;;;   len     - number of bytes to process in each buffer
;;;   vec     - number of source buffers
;;;   g_tbls  - lookup tables, consumed 32 bytes per source buffer:
;;;             first 16 bytes indexed by the low nibble of a source byte,
;;;             next 16 bytes indexed by the high nibble
;;;   buffs   - array of `vec` pointers to the source buffers
;;;   dest    - output buffer
;;;
;;; Returns 0 on success, 1 if len < 32 (nothing is written in that case).
;;;
;;; When len is not a multiple of 32, the last pass is redone at offset
;;; len-32, so it overlaps bytes that were already written.
;;;
;;; NOTE: the source loop tests its counter at the bottom, so it always
;;; runs at least once; callers are presumably expected to pass vec >= 1.
;;; NOTE: if EC_ALIGNED_ADDR is defined, aligned (or non-temporal) forms are
;;; used for every source load and dest store, including the overlapped tail
;;; pass at offset len-32.
;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
 %define tmp3  r9
 %define return rax
 %macro SLDR 2				;no-op: all arguments live in registers
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved and loaded
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   rdi 		; must be saved and loaded
 %define return rax
 %macro SLDR 2				;no-op: all arguments live in registers
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define frame_size 2*8			;two registers pushed by FUNC_SAVE
 %define arg(x)      [rsp + frame_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	rex_push_reg	r12
	push_reg	rdi
	end_prolog
	mov	arg4, arg(4)		;fifth argument is passed on the stack
 %endmacro

 %macro FUNC_RESTORE 0
	pop	rdi
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x: endbranch
 %define arg(x) [ebp + PS*2 + PS*x]

 %define trans   ecx			;trans is for the variables in stack
 %define arg0    trans
 %define arg0_m  arg(0)
 %define arg1    trans
 %define arg1_m  arg(1)
 %define arg2    arg2_m
 %define arg2_m  arg(2)
 %define arg3    ebx
 %define arg4    trans
 %define arg4_m  arg(4)
 %define tmp	 edx
 %define tmp.w   edx
 %define tmp.b   dl
 %define tmp2    edi
 %define tmp3    esi
 %define return  eax
 %macro SLDR 2	;stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	push	ebx
	mov	arg3, arg(3)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	mov	esp, ebp
	pop	ebp
 %endmacro

%endif	; output formats

%define len       arg0
%define vec       arg1
%define mul_array arg2
%define	src       arg3
%define dest      arg4

%define vec_i     tmp2
%define ptr       tmp3
%define pos       return

%ifidn PS,4				;32-bit code
 %define  vec_m   arg1_m
 %define  len_m   arg0_m
 %define  dest_m  arg4_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%ifidn PS,8				;64-bit code
 default rel
 [bits 64]
%endif

section .text

%define xmask0f   ymm3
%define xmask0fx  xmm3
%define xgft_lo   ymm4
%define xgft_hi   ymm5

%define x0     ymm0
%define xtmpa  ymm1
%define xp     ymm2

align 16
mk_global gf_vect_dot_prod_avx2, function
func(gf_vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			;len now holds the last valid 32-byte offset
	SSTR	len_m, len		;(mov, when emitted, leaves the flags of sub intact)
	jl	.return_fail		;fewer than 32 bytes: nothing to do
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

.loop32:
	vpxor	xp, xp			;Clear the accumulator for this column
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]	;ptr = buffs[vec_i]

	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo

	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			;Advance to the next source's table
	add	vec_i, 1

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 3-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 3-0
					; (bits shifted in above bit 3 are dropped by the mask below)
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 3-0

	vpshufb	xgft_hi, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
	vpxor	xp, xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 32			;Loop on 32 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;Length was a multiple of 32: done

	;; Tail len
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	vzeroupper			;Clear upper ymm halves so SSE code in the
					; caller does not pay an AVX/SSE transition penalty
	xor	return, return		;return 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1		;No ymm register has been written on this path
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func                   core, ver, snum
slversion gf_vect_dot_prod_avx2, 04,  05,  0190