;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifndef _MULTIBINARY_ASM_
%define _MULTIBINARY_ASM_

;;;;
; Syntax: NASM (Intel operand order).
;
; The macros below rely on names that are NOT defined in this file and are
; expected to be supplied by whatever includes it:
;   mk_global, endbranch, WRT_OPT, FLAG_CPUID1_ECX_*, FLAG_CPUID7_EBX_*,
;   FLAGS_CPUID7_*, FLAG_XGETBV_EAX_*
;;;;

;;;;
; Pointer-width abstraction: 32-bit registers and dd/dword when assembling
; for elf32, 64-bit registers and dq/qword for every other output format.
;;;;
%ifidn __OUTPUT_FORMAT__, elf32
 %define mbin_def_ptr	dd
 %define mbin_ptr_sz	dword
 %define mbin_rdi	edi
 %define mbin_rsi	esi
 %define mbin_rax	eax
 %define mbin_rbx	ebx
 %define mbin_rcx	ecx
 %define mbin_rdx	edx
%else
 %define mbin_def_ptr	dq
 %define mbin_ptr_sz	qword
 %define mbin_rdi	rdi
 %define mbin_rsi	rsi
 %define mbin_rax	rax
 %define mbin_rbx	rbx
 %define mbin_rcx	rcx
 %define mbin_rdx	rdx
%endif

; Assembler feature level used to decide which dispatch macros get a full
; implementation; defaults to 4 when the build does not set it.
%ifndef AS_FEATURE_LEVEL
%define AS_FEATURE_LEVEL 4
%endif

;;;;
; multibinary macro:
;   creates the visible entry point that uses HW optimized call pointer
;   creates the init of the HW optimized call pointer
;
; Parameters:
;   1-> function name
;
; Emits:
;   %1_dispatched  pointer in .data, initially the address of %1_mbinit
;   %1_mbinit      first-call stub: calls %1_dispatch_init, then falls
;                  through into %1
;   %1             public entry: indirect jump through %1_dispatched
;
; %1_dispatch_init must be provided separately (see the mbin_dispatch_init*
; macros below).
;;;;
%macro mbin_interface 1
	;;;;
	; *_dispatched is defaulted to *_mbinit and replaced on first call.
	; Therefore, *_dispatch_init is only executed on first call.
	;;;;
	section .data
	%1_dispatched:
		mbin_def_ptr	%1_mbinit

	section .text
	mk_global %1, function
	%1_mbinit:
		endbranch
		;;; only called the first time to setup hardware match
		call	%1_dispatch_init
		;;; falls thru to execute the hw optimized code
	%1:
		endbranch
		jmp	mbin_ptr_sz [%1_dispatched]
%endmacro

;;;;;
; mbin_dispatch_init parameters
; Use this function when SSE/00/01 is a minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
;
; Defines %1_dispatch_init, which stores the chosen function address in
; [%1_dispatched].  Every general-purpose register it uses is pushed on
; entry and popped before returning; flags are not preserved.
;;;;;
%macro mbin_dispatch_init 4
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT]	; Default to SSE 00/01

		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%3 WRT_OPT]	; AVX (gen2) opt func; lea leaves the cmp flags intact
		jne	_%1_init_done		; AVX is not available so end
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT]	; AVX2 (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%2 WRT_OPT]	; XMM/YMM state not enabled: back to the base

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init2 parameters
; Cases where only base functions are available
; 1-> function name
; 2-> base function
;
; Defines %1_dispatch_init, which unconditionally stores the address of %2
; in [%1_dispatched].
;;;;;
%macro mbin_dispatch_init2 2
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		lea	mbin_rsi, [%2 WRT_OPT]	; Default
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init_clmul parameters
; Use this case for CRC which needs both SSE4_1 and CLMUL
; 1-> function name
; 2-> base function
; 3-> SSE4_1 and CLMUL optimized function
; 4-> AVX/02 opt func
; 5-> AVX512/10 opt func (only considered when AS_FEATURE_LEVEL >= 10)
;;;;;
%macro mbin_dispatch_init_clmul 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT]	; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx		; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		jz	_%1_init_done
		test	ecx, FLAG_CPUID1_ECX_CLMUL
		jz	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT]	; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		jz	_%1_init_done
		xor	ecx, ecx
		xgetbv				; xcr -> edx:eax
		mov	edi, eax		; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		jz	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX/02 opt

%if AS_FEATURE_LEVEL >= 10
		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done		; No AVX2 possible

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done		; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done

		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%5 WRT_OPT]	; AVX512/10 opt
		cmove	mbin_rsi, mbin_rbx
%endif
	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init5 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
;;;;;
%macro mbin_dispatch_init5 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT]	; Default - use base function

		mov	eax, 1
		cpuid
		; Test for SSE4.2
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		lea	mbin_rbx, [%3 WRT_OPT]	; SSE opt func
		cmovne	mbin_rsi, mbin_rbx

		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%4 WRT_OPT]	; AVX (gen2) opt func; lea leaves the cmp flags intact
		jne	_%1_init_done		; AVX is not available so end
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%5 WRT_OPT]	; AVX2 (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT]	; XMM/YMM state not enabled: back to SSE func

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

%if AS_FEATURE_LEVEL >= 6
;;;;;
; mbin_dispatch_init6 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
;;;;;
%macro mbin_dispatch_init6 6
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT]	; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx		; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		jz	_%1_init_done		; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT]	; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		jz	_%1_init_done
		xor	ecx, ecx
		xgetbv				; xcr -> edx:eax
		mov	edi, eax		; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		jz	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT]	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done		; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		lea	mbin_rbx, [%6 WRT_OPT]	; AVX512/06 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

%else
; Assembler too old for the AVX512 variant: ignore %6 and dispatch as init5.
%macro mbin_dispatch_init6 6
	mbin_dispatch_init5 %1, %2, %3, %4, %5
%endmacro
%endif

%if AS_FEATURE_LEVEL >= 10
;;;;;
; mbin_dispatch_init7 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX512 Update/10 opt func
;;;;;
%macro mbin_dispatch_init7 7
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT]	; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx		; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		jz	_%1_init_done		; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT]	; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		jz	_%1_init_done
		xor	ecx, ecx
		xgetbv				; xcr -> edx:eax
		mov	edi, eax		; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		jz	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT]	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done		; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done		; G1 set incomplete: stay on AVX2/04 and
						; never pick the /10 function below
		lea	mbin_rsi, [%6 WRT_OPT]	; AVX512/06 opt

		;; Test for AVX512 Update (only reached when G1 is fully present)
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%7 WRT_OPT]	; AVX512/10 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init8 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX2 Update/07 opt func
; 8-> AVX512 Update/10 opt func
;;;;;
%macro mbin_dispatch_init8 8
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT]	; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx		; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		jz	_%1_init_done		; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT]	; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		jz	_%1_init_done
		xor	ecx, ecx
		xgetbv				; xcr -> edx:eax
		mov	edi, eax		; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		jz	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT]	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_check_avx2_g2	; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_check_avx2_g2	; G1 set incomplete: AVX512 unusable, so
						; fall back to the AVX2 Gen 2 test
						; (ecx still holds cpuid7.ecx here)
		lea	mbin_rsi, [%6 WRT_OPT]	; AVX512/06 opt

		;; Test for AVX512 Update (only reached when G1 is fully present)
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%8 WRT_OPT]	; AVX512/10 opt
		cmove	mbin_rsi, mbin_rbx
		jmp	_%1_init_done

	_%1_check_avx2_g2:
		;; Test for AVX2 Gen 2
		and	ecx, FLAGS_CPUID7_ECX_AVX2_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX2_G2
		lea	mbin_rbx, [%7 WRT_OPT]	; AVX2/07 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
%else
; Assembler too old for the /10 variants: ignore the extra parameters and
; dispatch as init6.
%macro mbin_dispatch_init7 7
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
%macro mbin_dispatch_init8 8
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
%endif

%endif ; ifndef _MULTIBINARY_ASM_