;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf, uint64_t len)

%define LIMIT        5552
%define BASE         0xFFF1        ; 65521

%define CHUNKSIZE    16
%define CHUNKSIZE_M1 (CHUNKSIZE-1)

%include "reg_sizes.asm"

default rel
[bits 64]

; need to keep free: eax, ecx, edx (clobbered by the div-based reductions)

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 %define init_d edi
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    r12d
 %define b_d    r8d
 %define end    r13

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
        push    r12
        push    r13
 %endmacro
 %macro FUNC_RESTORE 0
        pop     r13
        pop     r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 %define init_d r12d
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    esi
 %define b_d    edi
 %define end    r13

 %define stack_size 2*16 + 5*8  ; must be an odd multiple of 8
 %define arg(x)  [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        alloc_stack stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        save_reg rdi, 2*16 + 0*8
        save_reg rsi, 2*16 + 1*8
        save_reg r12, 2*16 + 2*8
        save_reg r13, 2*16 + 3*8
        end_prolog
        mov     init_d, ecx     ; initialize init_d from arg1 to keep ecx free
 %endmacro

 %macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        mov     rdi, [rsp + 2*16 + 0*8]
        mov     rsi, [rsp + 2*16 + 1*8]
        mov     r12, [rsp + 2*16 + 2*8]
        mov     r13, [rsp + 2*16 + 3*8]
        add     rsp, stack_size
 %endmacro
%endif

%define ya      ymm0
%define yb      ymm1
%define ydata0  ymm2
%define ydata1  ymm3
%define ysa     ymm4
%define ydata   ysa
%define ytmp0   ydata0
%define ytmp1   ydata1
%define ytmp2   ymm5
%define xa      xmm0
%define xb      xmm1
%define xtmp0   xmm2
%define xtmp1   xmm3
%define xsa     xmm4
%define xtmp2   xmm5
%define yshuf0  ymm6
%define yshuf1  ymm7
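; Algorithm notes:
;
; Adler-32 keeps two running sums over the buffer, both reduced mod BASE:
;   a = init_lo + x1 + x2 + ... + xn
;   b = init_hi + a1 + a2 + ... + an    (ai = value of a after byte xi)
; and the result is (b << 16) | a.
;
; LIMIT = 5552 is the largest n for which
;   255*n*(n+1)/2 + (n+1)*(BASE-1) < 2^32,
; i.e. the most bytes that can be accumulated in 32-bit lanes before a
; modulo reduction becomes necessary.
;
; The main loop spreads each 16-byte chunk across the eight dword lanes of
; ya (running byte sums) and yb (running sums of ya), eight bytes per step.
; Lane i only sees the bytes at offset i of each 8-byte group, so its bytes
; enter the running sum i positions later than lane 0's; the serial b is
; recovered from the lane totals as
;   b = 8*sum(yb[i]) - sum(i * ya[i])   (i = 0..7)
; which is what the vpslld/vpmulld/vpsubd "reduce" sequences below compute,
; with the A_SCALE table supplying the lane indices i.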
[bits 64]
default rel
section .text

mk_global adler32_avx2_4, function
func(adler32_avx2_4)
        FUNC_SAVE

        vmovdqa yshuf0, [SHUF0]
        vmovdqa yshuf1, [SHUF1]

        mov     data, arg2
        mov     size, arg3

        mov     b_d, init_d
        shr     b_d, 16
        and     init_d, 0xFFFF
        cmp     size, 32
        jb      .lt64
        vmovd   xa, init_d
        vpxor   yb, yb, yb
.sloop1:
        mov     s, LIMIT
        cmp     s, size
        cmova   s, size                 ; s = min(size, LIMIT)
        lea     end, [data + s - CHUNKSIZE_M1]
        cmp     data, end
        jae     .skip_loop_1a
align 32
.sloop1a:
        ; do CHUNKSIZE adds
        vbroadcastf128 ydata, [data]
        add     data, CHUNKSIZE
        vpshufb ydata0, ydata, yshuf0
        vpaddd  ya, ya, ydata0
        vpaddd  yb, yb, ya
        vpshufb ydata1, ydata, yshuf1
        vpaddd  ya, ya, ydata1
        vpaddd  yb, yb, ya
        cmp     data, end
        jb      .sloop1a

.skip_loop_1a:
        add     end, CHUNKSIZE_M1

        test    s, CHUNKSIZE_M1
        jnz     .do_final

        ; either we're done, or we just did LIMIT
        sub     size, s

        ; reduce
        vpslld  yb, yb, 3               ; b is scaled by 8
        vpmulld ysa, ya, [A_SCALE]      ; scaled a

        ; compute horizontal sums of ya, yb, ysa
        vextracti128 xtmp0, ya, 1
        vextracti128 xtmp1, yb, 1
        vextracti128 xtmp2, ysa, 1
        vpaddd  xa, xa, xtmp0
        vpaddd  xb, xb, xtmp1
        vpaddd  xsa, xsa, xtmp2
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa

        vmovd   eax, xa
        xor     edx, edx
        mov     ecx, BASE
        div     ecx                     ; divide edx:eax by ecx, quot->eax, rem->edx
        mov     a_d, edx

        vpsubd  xb, xb, xsa
        vmovd   eax, xb
        add     eax, b_d
        xor     edx, edx
        mov     ecx, BASE
        div     ecx                     ; divide edx:eax by ecx, quot->eax, rem->edx
        mov     b_d, edx

        test    size, size
        jz      .finish

        ; continue loop
        vmovd   xa, a_d
        vpxor   yb, yb, yb
        jmp     .sloop1

.finish:
        mov     eax, b_d
        shl     eax, 16
        or      eax, a_d
        jmp     .end

.lt64:  ; fewer than 32 bytes: finish with the scalar byte loop
        mov     a_d, init_d
        lea     end, [data + size]
        test    size, size
        jnz     .final_loop
        jmp     .zero_size

        ; handle remaining 1...15 bytes
.do_final:
        ; reduce
        vpslld  yb, yb, 3               ; b is scaled by 8
        vpmulld ysa, ya, [A_SCALE]      ; scaled a

        vextracti128 xtmp0, ya, 1
        vextracti128 xtmp1, yb, 1
        vextracti128 xtmp2, ysa, 1
        vpaddd  xa, xa, xtmp0
        vpaddd  xb, xb, xtmp1
        vpaddd  xsa, xsa, xtmp2
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa
        vpsubd  xb, xb, xsa

        vmovd   a_d, xa
        vmovd   eax, xb
        add     b_d, eax

align 32
.final_loop:
        movzx   eax, byte [data]
        add     a_d, eax
        inc     data
        add     b_d, a_d
        cmp     data, end
        jb      .final_loop

.zero_size:
        mov     eax, a_d
        xor     edx, edx
        mov     ecx, BASE
        div     ecx                     ; divide edx:eax by ecx, quot->eax, rem->edx
        mov     a_d, edx

        mov     eax, b_d
        xor     edx, edx
        mov     ecx, BASE
        div     ecx                     ; divide edx:eax by ecx, quot->eax, rem->edx
        shl     edx, 16
        or      edx, a_d
        mov     eax, edx

.end:
        FUNC_RESTORE
        ret

endproc_frame
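; Data layout notes:
; - A_SCALE holds the dword lane indices 0..7 used for the scaled-a
;   correction in the reduce steps (vpmulld ysa, ya, [A_SCALE]).
; - SHUF0/SHUF1 are vpshufb controls that expand the low and high eight
;   bytes of each 16-byte chunk (broadcast to both 128-bit lanes by
;   vbroadcastf128) into zero-extended dwords; the 0xFF control bytes
;   zero the upper three bytes of each dword.
; All three tables are loaded with vmovdqa and must stay 32-byte aligned.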
section .data
align 32
A_SCALE:
        dq      0x0000000100000000, 0x0000000300000002
        dq      0x0000000500000004, 0x0000000700000006
SHUF0:
        dq      0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
        dq      0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
SHUF1:
        dq      0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
        dq      0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E
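; Example use from C (a minimal sketch; the prototype mirrors the signature
; comment at the top of this file, and 1 is the standard Adler-32 seed):
;
;   extern uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf,
;                                  uint64_t len);
;
;   uint32_t sum = adler32_avx2_4(1, data, data_len);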