xref: /isa-l/include/multibinary.asm (revision 1500db751d08b6c4ad6097135fe78259540a2807)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%ifndef _MULTIBINARY_ASM_
31%define _MULTIBINARY_ASM_
32
;;;;
; Pointer-width aliases used by every macro in this file.
;   elf32 output      -> 32-bit pointer slot (dd/dword) and e* registers
;   any other format  -> 64-bit pointer slot (dq/qword) and r* registers
;;;;
%ifnidn __OUTPUT_FORMAT__, elf32
 %define mbin_def_ptr	dq
 %define mbin_ptr_sz	qword
 %define mbin_rdi	rdi
 %define mbin_rsi	rsi
 %define mbin_rax	rax
 %define mbin_rbx	rbx
 %define mbin_rcx	rcx
 %define mbin_rdx	rdx
%else
 %define mbin_def_ptr	dd
 %define mbin_ptr_sz	dword
 %define mbin_rdi	edi
 %define mbin_rsi	esi
 %define mbin_rax	eax
 %define mbin_rbx	ebx
 %define mbin_rcx	ecx
 %define mbin_rdx	edx
%endif

; Feature level assumed when the build does not define one. The macros
; below compare it against 6 and 10 to decide which dispatchers are emitted
; in full and which collapse to a simpler one.
%ifndef AS_FEATURE_LEVEL
 %define AS_FEATURE_LEVEL 4
%endif
56
;;;;
; mbin_interface: emit the public, self-dispatching entry point of a function
; 1-> function name
;
; Emitted symbols:
;   %1_dispatched  pointer-sized slot in .data, initialised to %1_mbinit
;   %1_mbinit      first-call stub: calls %1_dispatch_init, then runs into %1
;   %1             global entry point: indirect jump through %1_dispatched
;
; %1_dispatch_init is not created here; the mbin_dispatch_init* macros in
; this file define it and store the selected implementation in
; %1_dispatched. mk_global and endbranch are not defined in this file.
; The current section is .text after the macro has been expanded.
;;;;
%macro mbin_interface 1
	section .data
	%1_dispatched:
		mbin_def_ptr	%1_mbinit	; until replaced, %1 jumps to the stub below

	section .text
	mk_global %1, function
	%1_mbinit:
		endbranch
		;;; reached only through the initial value of %1_dispatched
		call	%1_dispatch_init
		;;; no ret: execution continues into %1 to service the pending call
	%1:
		endbranch
		jmp	mbin_ptr_sz [%1_dispatched]
%endmacro
82
;;;;;
; mbin_dispatch_init parameters
; For functions whose least capable variant already needs SSE (00/01).
; 1-> function name
; 2-> SSE/00/01 optimized function, also the fallback
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
;
; Defines %1_dispatch_init, which writes the chosen variant's address to
; %1_dispatched. All general-purpose registers it uses are saved and
; restored; flags are not preserved.
;;;;;
%macro mbin_dispatch_init 4
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT]	; rsi = current choice, starts at SSE 00/01

		;; cpuid(1).ecx must report both AVX and OSXSAVE to go any further
		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT]	; AVX (gen2) opt func

		;; cpuid(7,0).ebx: upgrade to the AVX2 variant when flagged
		mov	eax, 7
		xor	ecx, ecx
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT]	; lea leaves the flags from test intact
		cmovnz	mbin_rsi, mbin_rbx

		;; xgetbv(0): if xmm and ymm state are not both enabled, drop the
		;; AVX/AVX2 choice made above and return to the fallback
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%2 WRT_OPT]

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi	; publish the selection
		pop	mbin_rsi
		ret
%endmacro
134
;;;;;
; mbin_dispatch_init2 parameters
;  For functions that only have a base implementation
; 1-> function name
; 2-> base function
;
; Defines %1_dispatch_init, which stores the address of %2 in
; %1_dispatched without probing the CPU. rsi is saved and restored.
;;;;;
%macro mbin_dispatch_init2 2
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		lea	mbin_rsi, [%2 WRT_OPT]		; the only candidate
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
150
;;;;;
; mbin_dispatch_init_clmul parameters
; Use this case for CRC which needs both SSE4_1 and CLMUL
; 1-> function name
; 2-> base function
; 3-> SSE4_1 and CLMUL optimized function
; 4-> AVX/02 opt func
; 5-> AVX512/10 opt func (only considered when AS_FEATURE_LEVEL >= 10)
;
; Defines %1_dispatch_init, which writes the chosen variant's address to
; %1_dispatched. All general-purpose registers it uses are saved and
; restored; flags are not preserved.
;;;;;
%macro mbin_dispatch_init_clmul 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT]	; rsi = current choice, starts at base

		mov	eax, 1
		cpuid
		mov	ebx, ecx		; keep cpuid(1).ecx, ecx is reused for xgetbv
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		jz	_%1_init_done
		test	ecx, FLAG_CPUID1_ECX_CLMUL
		jz	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT]	; SSE4_1 and CLMUL present: 00/01 opt

		;; AVX needs OSXSAVE, xmm/ymm state enabled in XCR0, and the AVX flag
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		jz	_%1_init_done
		xor	ecx, ecx
		xgetbv				; xcr0 -> edx:eax
		mov	edi, eax		; keep xgetbv.eax for the zmm test

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		jz	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX/02 opt

%if AS_FEATURE_LEVEL >= 10
		;; AVX2 has no variant of its own here, but it is required
		;; before the AVX512 variant is considered
		mov	eax, 7
		xor	ecx, ecx
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done

		;; AVX512/10 needs the zmm/opmask state bits plus the complete
		;; G1 (ebx) and G2 (ecx) flag groups
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		jne	_%1_init_done
		lea	mbin_rsi, [%5 WRT_OPT]	; AVX512/10 opt
%endif
	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi	; publish the selection
		pop	mbin_rsi
		ret
%endmacro
225
;;;;;
; mbin_dispatch_init5 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
;
; Defines %1_dispatch_init, which writes the chosen variant's address to
; %1_dispatched. All general-purpose registers it uses are saved and
; restored; flags are not preserved.
;;;;;
%macro mbin_dispatch_init5 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT]	; rsi = current choice, starts at base

		mov	eax, 1
		cpuid
		;; SSE4.2 flagged: move up to the 00/01 variant
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		lea	mbin_rbx, [%3 WRT_OPT]	; lea leaves the flags from test intact
		cmovnz	mbin_rsi, mbin_rbx

		;; both AVX and OSXSAVE are required to go any further
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX (gen2) opt func

		;; cpuid(7,0).ebx: upgrade to the AVX2 variant when flagged
		mov	eax, 7
		xor	ecx, ecx
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%5 WRT_OPT]
		cmovnz	mbin_rsi, mbin_rbx

		;; xgetbv(0): if xmm and ymm state are not both enabled, drop the
		;; AVX/AVX2 choice made above and use the 00/01 variant instead
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT]

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi	; publish the selection
		pop	mbin_rsi
		ret
%endmacro
282
283%if AS_FEATURE_LEVEL >= 6
;;;;;
; mbin_dispatch_init6 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
;
; Defines %1_dispatch_init, which writes the chosen variant's address to
; %1_dispatched. Each stage below upgrades the choice and leaves as soon as
; a requirement is missing. All general-purpose registers it uses are saved
; and restored; flags are not preserved.
;;;;;
%macro mbin_dispatch_init6 6
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT]	; rsi = current choice, starts at base

		mov	eax, 1
		cpuid
		mov	ebx, ecx		; keep cpuid(1).ecx, ecx is reused for xgetbv
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		jz	_%1_init_done		; no SSE4_2: stay on base
		lea	mbin_rsi, [%3 WRT_OPT]	; 00/01 opt

		;; AVX needs OSXSAVE, xmm/ymm state enabled in XCR0, and the AVX flag
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		jz	_%1_init_done
		xor	ecx, ecx
		xgetbv				; xcr0 -> edx:eax
		mov	edi, eax		; keep xgetbv.eax for the zmm test

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		jz	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT]	; AVX/02 opt

		;; AVX2
		mov	eax, 7
		xor	ecx, ecx
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done
		lea	mbin_rsi, [%5 WRT_OPT]	; AVX2/04 opt

		;; AVX512: zmm/opmask state bits plus the complete G1 flag group
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done
		lea	mbin_rsi, [%6 WRT_OPT]	; AVX512/06 opt

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi	; publish the selection
		pop	mbin_rsi
		ret
%endmacro
352
353%else
; AS_FEATURE_LEVEL < 6: keep the six-argument interface but ignore the
; AVX512/06 function (%6) and dispatch among the first four variants only.
%macro mbin_dispatch_init6 6
	mbin_dispatch_init5 %1, %2, %3, %4, %5
%endmacro
357%endif
358
359%if AS_FEATURE_LEVEL >= 10
;;;;;
; mbin_dispatch_init7 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX512 Update/10 opt func
;
; Defines %1_dispatch_init, which writes the chosen variant's address to
; %1_dispatched. Each stage below upgrades the choice and leaves as soon as
; a requirement is missing; in particular the /10 variant is only reachable
; after the /06 requirements (G1 flag group) have been met. All
; general-purpose registers it uses are saved and restored; flags are not
; preserved.
;;;;;
%macro mbin_dispatch_init7 7
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		;; Leave here when the G1 group is incomplete. Testing G2 anyway
		;; would let a CPUID view with G2 bits but without full G1 select
		;; the /10 variant.
		jne	_%1_init_done
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

		;; Test for AVX512 Update (G2 flag group in cpuid7.ecx)
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%7 WRT_OPT] ; AVX512/10 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
434
;;;;;
; mbin_dispatch_init8 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX2 Update/07 opt func
; 8-> AVX512 Update/10 opt func
;
; Defines %1_dispatch_init, which writes the chosen variant's address to
; %1_dispatched. Once AVX2 is confirmed there are two branches:
;   - zmm/opmask state enabled and G1 group complete: /06, or /10 when
;     the G2 group is complete as well
;   - otherwise: /04, or /07 when the AVX2 G2 group is complete
; All general-purpose registers it uses are saved and restored; flags are
; not preserved.
;;;;;
%macro mbin_dispatch_init8 8
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_check_avx2_g2	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		;; An incomplete G1 group rules out both AVX512 variants, so the
		;; G2 test below must not run. ecx still holds cpuid7.ecx at
		;; this point, which is what the AVX2 G2 test needs.
		jne	_%1_check_avx2_g2
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

		;; Test for AVX512 Update (G2 flag group in cpuid7.ecx)
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%8 WRT_OPT] ; AVX512/10 opt
		cmove	mbin_rsi, mbin_rbx
		jmp     _%1_init_done

	_%1_check_avx2_g2:
		;; Test for AVX2 Gen 2
		and	ecx, FLAGS_CPUID7_ECX_AVX2_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX2_G2
		lea	mbin_rbx, [%7 WRT_OPT] ; AVX2/7 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
518%else
; AS_FEATURE_LEVEL < 10: keep the seven-argument interface but ignore the
; AVX512 Update/10 function (%7).
%macro mbin_dispatch_init7 7
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
; AS_FEATURE_LEVEL < 10: keep the eight-argument interface but ignore the
; AVX2 Update/07 (%7) and AVX512 Update/10 (%8) functions.
%macro mbin_dispatch_init8 8
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
525%endif
526
527%endif ; ifndef _MULTIBINARY_ASM_
528