;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;
; Multibinary dispatch helpers (NASM syntax, Intel operand order).
;
; For a function <name>, mbin_interface emits the public entry point and a
; pointer variable <name>_dispatched; one of the mbin_dispatch_* macros emits
; <name>_dispatch_init, which picks an implementation from CPUID/XGETBV
; results and stores its address in <name>_dispatched.
;
; Names used here but not defined in this file (the including source must
; provide them): mk_global, WRT_OPT, FLAG_CPUID1_ECX_*, FLAG_CPUID7_EBX_*,
; FLAGS_CPUID7_EBX_AVX512_G1, FLAGS_CPUID7_ECX_AVX512_G2, FLAG_XGETBV_EAX_*.
;
; Every <name>_dispatch_init saves and restores each general-purpose register
; it writes (cpuid writes eax/ebx/ecx/edx), so the caller's argument
; registers are intact when the selected implementation is entered.
; Arithmetic flags are modified.
;;;;

%ifndef _MULTIBINARY_ASM_
%define _MULTIBINARY_ASM_

;; Pointer-sized aliases so the same macro bodies assemble for elf32 and
;; for every other (64-bit) output format.
%ifidn __OUTPUT_FORMAT__, elf32
 %define mbin_def_ptr	dd
 %define mbin_ptr_sz	dword
 %define mbin_rdi	edi
 %define mbin_rsi	esi
 %define mbin_rax	eax
 %define mbin_rbx	ebx
 %define mbin_rcx	ecx
 %define mbin_rdx	edx
%else
 %define mbin_def_ptr	dq
 %define mbin_ptr_sz	qword
 %define mbin_rdi	rdi
 %define mbin_rsi	rsi
 %define mbin_rax	rax
 %define mbin_rbx	rbx
 %define mbin_rcx	rcx
 %define mbin_rdx	rdx
%endif

;; Assembler feature level; gates the AVX512-aware dispatchers below.
;; May be predefined by the build; defaults to 4.
%ifndef AS_FEATURE_LEVEL
%define AS_FEATURE_LEVEL 4
%endif

;;;;
; multibinary macro:
;   creates the visible entry point that uses HW optimized call pointer
;   creates the init of the HW optimized call pointer
;
; Parameters
;   1-> function name
;
; Emits
;   %1_dispatched (.data)  pointer, initially the address of %1_mbinit
;   %1_mbinit     (.text)  calls %1_dispatch_init, then falls into %1
;   %1            (.text)  global entry; indirect jmp through %1_dispatched
;;;;
%macro mbin_interface 1
	;;;;
	; *_dispatched is defaulted to *_mbinit and replaced on first call.
	; Therefore, *_dispatch_init is only executed on first call.
	;;;;
	section .data
	%1_dispatched:
		mbin_def_ptr	%1_mbinit

	section .text
	mk_global %1, function
	%1_mbinit:
		;;; only called the first time to setup hardware match
		call	%1_dispatch_init
		;;; falls thru to execute the hw optimized code
	%1:
		jmp	mbin_ptr_sz [%1_dispatched]
%endmacro

;;;;;
; mbin_dispatch_init parameters
; Use this function when SSE/00/01 is a minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
;
; Selection: %2 unless CPUID.1:ECX reports both AVX and OSXSAVE; then %3,
; upgraded to %4 when CPUID.7.0:EBX reports AVX2; reverted to %2 when
; XGETBV(0) does not report XMM and YMM state enabled.
;;;;;
%macro mbin_dispatch_init 4
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01

		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func; lea keeps flags
		jne	_%1_init_done ; AVX is not available so end
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX2 (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%2 WRT_OPT] ; OS has not enabled YMM state: back to base

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish choice before restoring rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init2 parameters
; Cases where only base functions are available
; 1-> function name
; 2-> base function
;
; Unconditionally stores the address of %2 in %1_dispatched.
;;;;;
%macro mbin_dispatch_init2 2
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init5 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_1 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
;
; Selection: %2 by default; %3 when CPUID.1:ECX reports SSE4.1; %4 when it
; reports both AVX and OSXSAVE; %5 when CPUID.7.0:EBX also reports AVX2.
; If XGETBV(0) then shows XMM/YMM state is not enabled, falls back to %3.
;;;;;
%macro mbin_dispatch_init5 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		; Test for SSE4.1
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		lea	mbin_rbx, [%3 WRT_OPT] ; SSE opt func
		cmovne	mbin_rsi, mbin_rbx

		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func; lea keeps flags
		jne	_%1_init_done ; AVX is not available so end
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%5 WRT_OPT] ; AVX2 (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT] ; OS has not enabled YMM state: back to SSE

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish choice before restoring rsi
		pop	mbin_rsi
		ret
%endmacro

%if AS_FEATURE_LEVEL >= 6
;;;;;
; mbin_dispatch_init6 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_1 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
;
; Tiers are tested in order and the first failing test ends the search,
; keeping the best function found so far:
;   SSE4.1 -> OSXSAVE -> XMM/YMM enabled -> AVX -> AVX2 -> ZMM/opmask
;   enabled -> full FLAGS_CPUID7_EBX_AVX512_G1 feature set.
;;;;;
%macro mbin_dispatch_init6 6
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		je	_%1_init_done	  ; Use base function if no SSE4_1
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done	  ; xgetbv must not be executed without OSXSAVE
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		lea	mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish choice before restoring rsi
		pop	mbin_rsi
		ret
%endmacro

%else
;; AS_FEATURE_LEVEL < 6: ignore the AVX512 argument and use the
;; five-argument dispatcher.
%macro mbin_dispatch_init6 6
	mbin_dispatch_init5 %1, %2, %3, %4, %5
%endmacro
%endif

%if AS_FEATURE_LEVEL >= 10
;;;;;
; mbin_dispatch_init7 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX512 Update/10 opt func
;
; Tiers are tested in order and the first failing test ends the search,
; keeping the best function found so far:
;   SSE4.2 -> OSXSAVE -> XMM/YMM enabled -> AVX -> AVX2 -> ZMM/opmask
;   enabled -> full FLAGS_CPUID7_EBX_AVX512_G1 set -> full
;   FLAGS_CPUID7_ECX_AVX512_G2 set.
; %7 is only selected when the G1 set is complete as well.
;;;;;
%macro mbin_dispatch_init7 7
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done	  ; xgetbv must not be executed without OSXSAVE
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done	  ; G1 incomplete: never consider the G2 tier
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

		;; Test for AVX512 update; ecx still holds cpuid7.ecx from above
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%7 WRT_OPT] ; AVX512/10 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish choice before restoring rsi
		pop	mbin_rsi
		ret
%endmacro
%else
;; AS_FEATURE_LEVEL < 10: ignore the AVX512-update argument and use the
;; six-argument dispatcher.
%macro mbin_dispatch_init7 7
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
%endif

;;;;;
; mbin_dispatch_sse_to_avx2_shani parameters
; derived from mbin_dispatch_init
; Use this function when SSE/00/01 is a minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
; 5-> SHANI opt for GLM
;
; Selection: as mbin_dispatch_init when CPUID.1:ECX reports both AVX and
; OSXSAVE. Otherwise %5 when CPUID.7.0:EBX reports SHA, else %2.
;;;;;
%macro mbin_dispatch_sse_to_avx2_shani 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01

		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func; lea keeps flags
		jne	_%1_shani_check ; AVX is not available so check shani
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX2 (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%2 WRT_OPT] ; OS has not enabled YMM state: back to base

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish choice before restoring rsi
		pop	mbin_rsi
		ret

	_%1_shani_check:
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%5 WRT_OPT] ; SHANI opt func
		cmovne	mbin_rsi, mbin_rbx
		jmp	_%1_init_done ; end
%endmacro

;;;;;
; mbin_dispatch_base_to_avx512_shani parameters
; derived from mbin_dispatch_init6
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> SHANI opt for GLM
; 8-> SHANI opt for CNL
;
; Selection:
;   no SSE4.2                                   -> %2
;   SSE4.2, but OSXSAVE, XMM/YMM state or AVX
;   missing                                     -> %7 if SHA, else %3
;   AVX without AVX2                            -> %4
;   AVX2, ZMM/opmask state not enabled or the
;   FLAGS_CPUID7_EBX_AVX512_G1 set incomplete   -> %5
;   complete AVX512 G1                          -> %8 if SHA, else %6
; %8 is only selected when the AVX512 G1 set is complete.
;;;;;
%macro mbin_dispatch_base_to_avx512_shani 8
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_shani_check	  ; xgetbv must not be executed without OSXSAVE
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_shani_check
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_shani_check
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done	  ; G1 incomplete: never consider %8
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

		;; Test for SHANI (ebx was overwritten above, so query leaf 7 again)
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%8 WRT_OPT] ; SHANI opt func for the AVX512 tier
		cmovne	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish choice before restoring rsi
		pop	mbin_rsi
		ret

	_%1_shani_check:
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%7 WRT_OPT] ; SHANI opt sse func
		cmovne	mbin_rsi, mbin_rbx
		jmp	_%1_init_done ; end
%endmacro



%endif ; ifndef _MULTIBINARY_ASM_