xref: /onnv-gate/usr/src/common/openssl/crypto/bn/asm/bn-586.pl (revision 2139:6243c3338933)
#!/usr/local/bin/perl
#
# Driver for the x86 bignum word routines.  It loads the perlasm helpers
# from x86asm.pl and then calls one generator sub per routine; the
# generator subs are defined further down in this file.

# x86asm.pl is searched for in ./perlasm and ../../perlasm.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# $sse2 becomes 1 when any command-line argument contains
# -DOPENSSL_IA32_SSE2, and 0 otherwise.
$sse2 = grep(/-DOPENSSL_IA32_SSE2/, @ARGV) ? 1 : 0;

# The SSE2 variant of bn_mul_add_words refers to OPENSSL_ia32cap_P.
if ($sse2) {
    &external_label("OPENSSL_ia32cap_P");
}

# Emit the routines in this fixed order.  Each generator sub has the same
# name as the routine it emits and receives that name as its argument.
foreach $routine (qw(bn_mul_add_words
                     bn_mul_words
                     bn_sqr_words
                     bn_div_words
                     bn_add_words
                     bn_sub_words
                     bn_sub_part_words)) {
    &$routine($routine);
}

&asm_finish();
220Sstevel@tonic-gate
sub bn_mul_add_words
{
    # Emit the routine called $name.
    #
    # Going by the loads below, the emitted routine takes four stack
    # arguments: wparam(0) = r (destination words), wparam(1) = a (source
    # words), wparam(2) = word count, wparam(3) = w (multiplier).  For each
    # word it computes r[i] += a[i]*w plus the running carry word, and
    # leaves the final carry word in eax.
    #
    # The call order below is the instruction order of the output, and
    # wparam()/swtmp() are evaluated relative to the push/pop calls around
    # them, so statements must not be reordered.
    local($name) = $_[0];

    if ($sse2) {
        &function_begin($name,"EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    } else {
        &function_begin($name,"");
    }

    &comment("");
    ($Low,$High) = ("eax","edx");       # not referenced in this file
    ($a,$w,$r,$c) = ("ebx","ebp","edi","esi");

    &xor($c,$c);                        # carry word starts at zero
    &mov($r,&wparam(0));
    &mov("ecx",&wparam(2));
    &mov($a,&wparam(1));
    &and("ecx",0xfffffff8);             # count rounded down to a multiple of 8
    &mov($w,&wparam(3));
    &push("ecx");                       # one stack slot, used through swtmp(0)

    # Branches on the zero flag left by the and above; the mov and push
    # in between do not alter x86 flags.
    &jz(&label("maw_finish"));

    if ($sse2) {
        # Runtime dispatch: test bit 26 of the dword that eax points at
        # after picmeup (presumably the address of OPENSSL_ia32cap_P, with
        # bit 26 being the SSE2 feature flag -- confirm against x86asm.pl
        # and the capability word's definition).  If the bit is clear, use
        # the integer loop instead.
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("maw_loop"));

        &movd("mm0",$w);                # mm0 holds w
        &pxor("mm1","mm1");             # mm1 holds the running carry, zero at first

        # Eight words per pass.  mm1 accumulates carry + r[k] + w*a[k];
        # its low dword is stored to r[k] and the shift by 32 leaves the
        # carry for word k+1.
        &set_label("maw_sse2_loop",0);
        &movd("mm3",&DWP(0,$r,"",0));   # r[0]
        &paddq("mm1","mm3");            # carry + r[0]
        &movd("mm2",&DWP(0,$a,"",0));   # a[0]
        &pmuludq("mm2","mm0");          # w*a[0]
        &movd("mm4",&DWP(4,$a,"",0));   # a[1]
        &pmuludq("mm4","mm0");          # w*a[1]
        &movd("mm6",&DWP(8,$a,"",0));   # a[2]
        &pmuludq("mm6","mm0");          # w*a[2]
        &movd("mm7",&DWP(12,$a,"",0));  # a[3]
        &pmuludq("mm7","mm0");          # w*a[3]
        &paddq("mm1","mm2");            # carry + r[0] + w*a[0]
        &movd("mm3",&DWP(4,$r,"",0));   # r[1]
        &paddq("mm3","mm4");            # r[1] + w*a[1]
        &movd("mm5",&DWP(8,$r,"",0));   # r[2]
        &paddq("mm5","mm6");            # r[2] + w*a[2]
        &movd("mm4",&DWP(12,$r,"",0));  # r[3]
        &paddq("mm7","mm4");            # r[3] + w*a[3]
        &movd(&DWP(0,$r,"",0),"mm1");   # store word 0
        &movd("mm2",&DWP(16,$a,"",0));  # a[4]
        &pmuludq("mm2","mm0");          # w*a[4]
        &psrlq("mm1",32);               # carry out of word 0
        &movd("mm4",&DWP(20,$a,"",0));  # a[5]
        &pmuludq("mm4","mm0");          # w*a[5]
        &paddq("mm1","mm3");            # carry + r[1] + w*a[1]
        &movd("mm6",&DWP(24,$a,"",0));  # a[6]
        &pmuludq("mm6","mm0");          # w*a[6]
        &movd(&DWP(4,$r,"",0),"mm1");   # store word 1
        &psrlq("mm1",32);               # carry out of word 1
        &movd("mm3",&DWP(28,$a,"",0));  # a[7]
        &add($a,32);                    # a advances by 8 words
        &pmuludq("mm3","mm0");          # w*a[7]
        &paddq("mm1","mm5");            # carry + r[2] + w*a[2]
        &movd("mm5",&DWP(16,$r,"",0));  # r[4]
        &paddq("mm2","mm5");            # r[4] + w*a[4]
        &movd(&DWP(8,$r,"",0),"mm1");   # store word 2
        &psrlq("mm1",32);               # carry out of word 2
        &paddq("mm1","mm7");            # carry + r[3] + w*a[3]
        &movd("mm5",&DWP(20,$r,"",0));  # r[5]
        &paddq("mm4","mm5");            # r[5] + w*a[5]
        &movd(&DWP(12,$r,"",0),"mm1");  # store word 3
        &psrlq("mm1",32);               # carry out of word 3
        &paddq("mm1","mm2");            # carry + r[4] + w*a[4]
        &movd("mm5",&DWP(24,$r,"",0));  # r[6]
        &paddq("mm6","mm5");            # r[6] + w*a[6]
        &movd(&DWP(16,$r,"",0),"mm1");  # store word 4
        &psrlq("mm1",32);               # carry out of word 4
        &paddq("mm1","mm4");            # carry + r[5] + w*a[5]
        &movd("mm5",&DWP(28,$r,"",0));  # r[7]
        &paddq("mm3","mm5");            # r[7] + w*a[7]
        &movd(&DWP(20,$r,"",0),"mm1");  # store word 5
        &psrlq("mm1",32);               # carry out of word 5
        &paddq("mm1","mm6");            # carry + r[6] + w*a[6]
        &movd(&DWP(24,$r,"",0),"mm1");  # store word 6
        &psrlq("mm1",32);               # carry out of word 6
        &paddq("mm1","mm3");            # carry + r[7] + w*a[7]
        &movd(&DWP(28,$r,"",0),"mm1");  # store word 7
        &add($r,32);                    # r advances by 8 words
        &psrlq("mm1",32);               # carry into the next pass

        &sub("ecx",8);
        &jnz(&label("maw_sse2_loop"));

        &movd($c,"mm1");                # hand the carry to the integer tail code
        &emms();

        &jmp(&label("maw_finish"));
    }

    # Integer loop, eight words per pass.
    &set_label("maw_loop",0);

    &mov(&swtmp(0),"ecx");              # park the remaining count in the stack slot

    for ($i=0; $i<=28; $i+=4) {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a,"",0));   # eax = a word
        &mul($w);                       # edx:eax = a word * w
        &add("eax",$c);                 # low half += incoming carry word
        &mov($c,&DWP($i,$r,"",0));      # fetch the r word (reuses the carry register)
        &adc("edx",0);                  # fold the carry flag into the high half
        &add("eax",$c);                 # low half += r word
        &adc("edx",0);                  # fold the carry flag into the high half
        &mov(&DWP($i,$r,"",0),"eax");   # r word = low half
        &mov($c,"edx");                 # high half is the next carry word
    }

    &comment("");
    &mov("ecx",&swtmp(0));              # reload the remaining count
    &add($a,32);
    &add($r,32);
    &sub("ecx",8);
    &jnz(&label("maw_loop"));

    # Tail: between 0 and 7 leftover words.
    &set_label("maw_finish",0);
    &mov("ecx",&wparam(2));             # reload the full word count
    &and("ecx",7);
    &jnz(&label("maw_finish2"));
    &jmp(&label("maw_end"));

    &set_label("maw_finish2",1);
    for ($i=0; $i<=6; $i++) {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0)); # eax = a word
        &mul($w);                       # edx:eax = a word * w
        &add("eax",$c);                 # low half += incoming carry word
        &mov($c,&DWP($i*4,$r,"",0));    # fetch the r word
        &adc("edx",0);
        &add("eax",$c);                 # low half += r word
        &adc("edx",0);
        # No countdown is emitted for the seventh word: the tail is at
        # most seven words, so control simply falls through to maw_end.
        &dec("ecx") unless ($i == 6);
        &mov(&DWP($i*4,$r,"",0),"eax"); # r word = low half
        &mov($c,"edx");                 # next carry word
        &jz(&label("maw_end")) unless ($i == 6);    # flag comes from the dec above
    }

    &set_label("maw_end",0);
    &mov("eax",$c);                     # final carry word goes to eax

    &pop("ecx");                        # drop the stack slot pushed at the top

    &function_end($name);
}
1820Sstevel@tonic-gate
sub bn_mul_words
{
    # Emit the routine called $name.
    #
    # Going by the loads below, the emitted routine takes wparam(0) = r
    # (destination words), wparam(1) = a (source words), wparam(2) = word
    # count and wparam(3) = w (multiplier).  Each r word receives the low
    # half of a word * w plus the running carry word; the final carry word
    # is left in eax.
    local($name) = $_[0];

    &function_begin($name,"");

    &comment("");
    ($Low,$High) = ("eax","edx");       # not referenced in this file
    ($a,$w,$r,$c,$num) = ("ebx","ecx","edi","esi","ebp");

    &xor($c,$c);                        # carry word starts at zero
    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));
    &mov($w,&wparam(3));

    &and($num,0xfffffff8);              # count rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    # Main loop, eight words per pass.
    &set_label("mw_loop",0);
    for ($i=0; $i<=28; $i+=4) {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a,"",0));   # eax = a word
        &mul($w);                       # edx:eax = a word * w
        &add("eax",$c);                 # low half += incoming carry word

        &adc("edx",0);                  # fold the carry flag into the high half
        &mov(&DWP($i,$r,"",0),"eax");   # r word = low half

        &mov($c,"edx");                 # high half is the next carry word
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    # Tail: between 0 and 7 leftover words.
    &set_label("mw_finish",0);
    &mov($num,&wparam(2));              # reload the full word count
    &and($num,7);
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2",1);
    for ($i=0; $i<=6; $i++) {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0)); # eax = a word
        &mul($w);                       # edx:eax = a word * w
        &add("eax",$c);                 # low half += incoming carry word
        &adc("edx",0);
        &mov(&DWP($i*4,$r,"",0),"eax"); # r word = low half
        &mov($c,"edx");                 # next carry word
        # The seventh word needs no countdown; control falls through.
        unless ($i == 6) {
            &dec($num);
            &jz(&label("mw_end"));
        }
    }

    &set_label("mw_end",0);
    &mov("eax",$c);                     # final carry word goes to eax

    &function_end($name);
}
2550Sstevel@tonic-gate
sub bn_sqr_words
{
    # Emit the routine called $name.
    #
    # Going by the loads below, the emitted routine takes wparam(0) = r,
    # wparam(1) = a and wparam(2) = word count.  Every a word is squared
    # and the 64-bit product is stored as two consecutive r words (low
    # half first), so r advances twice as fast as a.
    local($name) = $_[0];

    &function_begin($name,"");

    &comment("");
    ($r,$a,$num) = ("esi","edi","ebx");

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));

    &and($num,0xfffffff8);              # count rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    # Main loop, eight input words (sixteen output words) per pass.
    &set_label("sw_loop",0);
    for ($i=0; $i<=28; $i+=4) {
        &comment("Round $i");
        &mov("eax",&DWP($i,$a,"",0));       # eax = a word
        &mul("eax");                        # edx:eax = a word squared
        &mov(&DWP($i*2,$r,"",0),"eax");     # low half
        &mov(&DWP($i*2+4,$r,"",0),"edx");   # high half
    }

    &comment("");
    &add($a,32);
    &add($r,64);
    &sub($num,8);
    &jnz(&label("sw_loop"));

    # Tail: between 0 and 7 leftover words.
    &set_label("sw_finish",0);
    &mov($num,&wparam(2));              # reload the full word count
    &and($num,7);
    &jz(&label("sw_end"));

    for ($i=0; $i<=6; $i++) {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0));     # eax = a word
        &mul("eax");                        # edx:eax = a word squared
        &mov(&DWP($i*8,$r,"",0),"eax");     # low half
        # The seventh word needs no countdown; control falls through.
        &dec($num) unless ($i == 6);
        &mov(&DWP($i*8+4,$r,"",0),"edx");   # high half
        &jz(&label("sw_end")) unless ($i == 6);     # flag comes from the dec above
    }
    &set_label("sw_end",0);

    &function_end($name);
}
3110Sstevel@tonic-gate
sub bn_div_words
{
    # Emit the routine called $name: load wparam(0) into edx and wparam(1)
    # into eax, then divide edx:eax by wparam(2) (held in ebx) with a
    # single div instruction.
    #
    # NOTE(review): x86 div leaves the quotient in eax and faults when the
    # quotient does not fit in 32 bits; no guard is emitted here, so
    # callers presumably guarantee wparam(0) < wparam(2) -- confirm.
    local($name) = $_[0];

    &function_begin($name,"");

    &mov("edx",&wparam(0));     # high word of the dividend
    &mov("eax",&wparam(1));     # low word of the dividend
    &mov("ebx",&wparam(2));     # divisor
    &div("ebx");

    &function_end($name);
}
3230Sstevel@tonic-gate
sub bn_add_words
{
    # Emit the routine called $name.
    #
    # Going by the loads below, the emitted routine takes wparam(0) = r,
    # wparam(1) = a, wparam(2) = b and wparam(3) = word count.  It stores
    # r[i] = a[i] + b[i] + carry for each word and keeps the carry (0 or 1)
    # in eax, which is where it is when the routine ends.
    local($name) = $_[0];

    &function_begin($name,"");

    &comment("");
    ($a,$b,$c,$r) = ("esi","edi","eax","ebx");
    ($tmp1,$tmp2,$num) = ("ecx","edx","ebp");

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($b,&wparam(2));
    &mov($num,&wparam(3));
    &xor($c,$c);                        # carry starts at zero
    &and($num,0xfffffff8);              # count rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # Main loop, eight words per pass.
    &set_label("aw_loop",0);
    for ($i=0; $i<=7; $i++) {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &add($tmp1,$c);                     # a word + incoming carry
        &mov($c,0);                         # mov keeps the carry flag intact
        &adc($c,$c);                        # c = carry out of the first add
        &add($tmp1,$tmp2);                  # ... + b word
        &adc($c,0);                         # c += carry out of the second add
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    # Tail: between 0 and 7 leftover words.
    &set_label("aw_finish",0);
    &mov($num,&wparam(3));              # reload the full word count
    &and($num,7);
    &jz(&label("aw_end"));

    for ($i=0; $i<=6; $i++) {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &add($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &add($tmp1,$tmp2);
        &adc($c,0);
        # The seventh word needs no countdown; control falls through.
        &dec($num) unless ($i == 6);
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
        &jz(&label("aw_end")) unless ($i == 6);     # flag comes from the dec above
    }
    &set_label("aw_end",0);

    # No final move is needed: the carry already lives in eax ($c).

    &function_end($name);
}
3950Sstevel@tonic-gate
sub bn_sub_words
{
    # Emit the routine called $name.
    #
    # Going by the loads below, the emitted routine takes wparam(0) = r,
    # wparam(1) = a, wparam(2) = b and wparam(3) = word count.  It stores
    # r[i] = a[i] - b[i] - borrow for each word and keeps the borrow
    # (0 or 1) in eax, which is where it is when the routine ends.
    #
    # The labels keep the "aw_" prefix also used by bn_add_words; they are
    # reproduced unchanged here.
    local($name) = $_[0];

    &function_begin($name,"");

    &comment("");
    ($a,$b,$c,$r) = ("esi","edi","eax","ebx");
    ($tmp1,$tmp2,$num) = ("ecx","edx","ebp");

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($b,&wparam(2));
    &mov($num,&wparam(3));
    &xor($c,$c);                        # borrow starts at zero
    &and($num,0xfffffff8);              # count rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # Main loop, eight words per pass.
    &set_label("aw_loop",0);
    for ($i=0; $i<=7; $i++) {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &sub($tmp1,$c);                     # a word - incoming borrow
        &mov($c,0);                         # mov keeps the carry flag intact
        &adc($c,$c);                        # c = borrow out of the first sub
        &sub($tmp1,$tmp2);                  # ... - b word
        &adc($c,0);                         # c += borrow out of the second sub
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    # Tail: between 0 and 7 leftover words.
    &set_label("aw_finish",0);
    &mov($num,&wparam(3));              # reload the full word count
    &and($num,7);
    &jz(&label("aw_end"));

    for ($i=0; $i<=6; $i++) {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        # The seventh word needs no countdown; control falls through.
        &dec($num) unless ($i == 6);
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
        &jz(&label("aw_end")) unless ($i == 6);     # flag comes from the dec above
    }
    &set_label("aw_end",0);

    # No final move is needed: the borrow already lives in eax ($c).

    &function_end($name);
}
4670Sstevel@tonic-gate
sub bn_sub_part_words
{
    # Emit the routine called $name.
    #
    # Going by the loads below, the emitted routine takes wparam(0) = r,
    # wparam(1) = a, wparam(2) = b, wparam(3) = word count and a fifth
    # argument wparam(4), called dl in the comments.
    #
    #  1. The first wparam(3) words are subtracted exactly as in
    #     bn_sub_words (r = a - b - borrow), except that the tail advances
    #     the three pointers one word at a time so they end up just past
    #     the common part.
    #  2. dl == 0: finished.
    #  3. dl <  0: for -dl further words, r = 0 - b - borrow.
    #  4. dl >  0: for dl further words, r = a - borrow.  As soon as one
    #     of these subtractions produces no borrow, control jumps into a
    #     plain copy loop for the remaining words, which ends with the
    #     borrow set to 0.
    #
    # The borrow is kept in eax ($c) throughout, which is where it is when
    # the routine ends.  The "pw_nc<i>" and "pw_tail_nc<i>" labels sit
    # inside the copy loops so that the borrow loops can jump to the
    # matching position; call order therefore must not change.
    local($name) = $_[0];

    &function_begin($name,"");

    &comment("");
    ($a,$b,$c,$r) = ("esi","edi","eax","ebx");
    ($tmp1,$tmp2,$num) = ("ecx","edx","ebp");

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($b,&wparam(2));
    &mov($num,&wparam(3));
    &xor($c,$c);                        # borrow starts at zero
    &and($num,0xfffffff8);              # count rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # ---- common part, eight words per pass ----
    &set_label("aw_loop",0);
    for ($i=0; $i<=7; $i++) {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &sub($tmp1,$c);                     # a word - incoming borrow
        &mov($c,0);                         # mov keeps the carry flag intact
        &adc($c,$c);                        # c = borrow out of the first sub
        &sub($tmp1,$tmp2);                  # ... - b word
        &adc($c,0);                         # c += borrow out of the second sub
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    # ---- common part, 0..7 leftover words ----
    &set_label("aw_finish",0);
    &mov($num,&wparam(3));              # reload the full word count
    &and($num,7);
    &jz(&label("aw_end"));

    for ($i=0; $i<=6; $i++) {
        &comment("Tail Round $i");
        # Offsets stay at 0 here because the pointers themselves move.
        &mov($tmp1,&DWP(0,$a,"",0));        # a word
        &mov($tmp2,&DWP(0,$b,"",0));        # b word
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &mov(&DWP(0,$r,"",0),$tmp1);        # store the r word
        &add($a, 4);
        &add($b, 4);
        &add($r, 4);
        # The seventh word needs no countdown; control falls through.
        unless ($i == 6) {
            &dec($num);
            &jz(&label("aw_end"));
        }
    }
    &set_label("aw_end",0);

    # ---- dispatch on dl ----
    # dl is tested for zero twice (once in memory, once in a register);
    # both tests are kept as in the original instruction stream.
    &cmp(&wparam(4),0);
    &je(&label("pw_end"));

    &mov($num,&wparam(4));              # dl
    &cmp($num,0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));

    # ---- dl < 0: r = 0 - b - borrow for -dl words ----
    &comment("pw_neg");
    &mov($tmp2,0);
    &sub($tmp2,$num);
    &mov($num,$tmp2);                   # num = -dl
    &and($num,0xfffffff8);              # rounded down to a multiple of 8
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop",0);
    for ($i=0; $i<=7; $i++) {
        &comment("dl<0 Round $i");

        &mov($tmp1,0);
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &sub($tmp1,$c);                     # 0 - incoming borrow
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);                  # ... - b word
        &adc($c,0);
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
    }

    &comment("");
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish",0);
    &mov($tmp2,&wparam(4));             # dl
    &mov($num,0);
    &sub($num,$tmp2);                   # num = -dl
    &and($num,7);
    &jz(&label("pw_end"));

    for ($i=0; $i<=6; $i++) {
        &comment("dl<0 Tail Round $i");
        &mov($tmp1,0);
        &mov($tmp2,&DWP($i*4,$b,"",0));     # b word
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) unless ($i == 6);
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
        &jz(&label("pw_end")) unless ($i == 6);     # flag comes from the dec above
    }

    &jmp(&label("pw_end"));

    # ---- dl > 0: r = a - borrow for dl words ----
    &set_label("pw_pos",0);

    &and($num,0xfffffff8);              # dl rounded down to a multiple of 8
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop",0);

    for ($i=0; $i<=7; $i++) {
        &comment("dl>0 Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &sub($tmp1,$c);                     # a word - borrow
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
        &jnc(&label("pw_nc$i"));            # no borrow left: continue in the copy loop
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish",0);
    &mov($num,&wparam(4));              # dl
    &and($num,7);
    &jz(&label("pw_end"));

    for ($i=0; $i<=6; $i++) {
        &comment("dl>0 Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &sub($tmp1,$c);                     # a word - borrow
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # store the r word
        &jnc(&label("pw_tail_nc$i"));       # no borrow left: continue in the copy tail
        unless ($i == 6) {
            &dec($num);
            &jz(&label("pw_end"));
        }
    }
    &mov($c,1);                         # borrow survived all seven tail words
    &jmp(&label("pw_end"));

    # ---- dl > 0, borrow gone: plain copy r = a ----
    &set_label("pw_nc_loop",0);
    for ($i=0; $i<=7; $i++) {
        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # copy to r
        &set_label("pw_nc$i",0);            # entry point from the borrow loop, after word $i
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_nc_loop"));

    &mov($num,&wparam(4));              # dl
    &and($num,7);
    &jz(&label("pw_nc_end"));

    for ($i=0; $i<=6; $i++) {
        &mov($tmp1,&DWP($i*4,$a,"",0));     # a word
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # copy to r
        &set_label("pw_tail_nc$i",0);       # entry point from the borrow tail, after word $i
        unless ($i == 6) {
            &dec($num);
            &jz(&label("pw_nc_end"));
        }
    }

    &set_label("pw_nc_end",0);
    &mov($c,0);                         # the copy path always ends without a borrow

    &set_label("pw_end",0);

    # No final move is needed: the borrow already lives in eax ($c).

    &function_end($name);
}
6750Sstevel@tonic-gate
676