/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_MD5_BYTESWAP_H
#define	_MD5_BYTESWAP_H

/*
 * Definitions of inline functions for little-endian loads.
 *
 * This file has special definitions for UltraSPARC architectures,
 * which have a special address space identifier for loading 32- and 16-bit
 * integers in little-endian byte order.
 *
 * This file and common/crypto/md5/sparc/sun4[uv]/byteswap.il implement the
 * same thing and must be changed together.
 */

#include <sys/types.h>
#if defined(__sparc)
#include <v9/sys/asi.h>
#elif defined(_LITTLE_ENDIAN)
#include <sys/byteorder.h>
#endif

#ifdef	__cplusplus
extern "C" {
#endif

#if defined(_LITTLE_ENDIAN)

/*
 * Little-endian optimization:  I don't need to do any weirdness.  On
 * some little-endian boxen, I'll have to do alignment checks, but I can do
 * that below.
 */

#if !defined(__i386) && !defined(__amd64)
/*
 * i386 and amd64 don't require aligned 4-byte loads.  The symbol
 * _MD5_CHECK_ALIGNMENT indicates below whether the MD5Transform function
 * requires alignment checking.
 */
#define	_MD5_CHECK_ALIGNMENT
#endif /* !__i386 && !__amd64 */

#define	LOAD_LITTLE_32(addr)	(*(uint32_t *)(void *)(addr))
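
/*
 * Usage sketch (illustrative only; `block', `x' and `i' are hypothetical
 * names, not part of this header):  MD5 consumes each 64-byte input block
 * as sixteen 32-bit little-endian words, so a transform routine would
 * typically fetch them as
 *
 *	const uint8_t *block = ...;	(the 64-byte chunk)
 *	uint32_t x[16];
 *	int i;
 *
 *	for (i = 0; i < 16; i++)
 *		x[i] = LOAD_LITTLE_32(block + 4 * i);
 *
 * When _MD5_CHECK_ALIGNMENT is defined, the caller is expected to verify
 * that `block' is 4-byte aligned, e.g. ((uintptr_t)block & 0x3) == 0,
 * before using this macro, or to fall back to byte-wise assembly of the
 * words.
 */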

#else	/* !_LITTLE_ENDIAN */

/*
 * sparc v9/v8plus optimization:
 *
 * on the sparc v9/v8plus, we can load data little endian.  however, since
 * the compiler doesn't have direct support for little endian, we
 * link to an assembly-language routine `load_little_32' to do
 * the magic.  note that special care must be taken to ensure the
 * address is 32-bit aligned -- in the interest of speed, we don't
 * check to make sure, since careful programming can guarantee this
 * for us.
 */
#if defined(sun4u)

/* Define alignment check because we can 4-byte load as little endian. */
#define	_MD5_CHECK_ALIGNMENT
#define	LOAD_LITTLE_32(addr)	load_little_32((uint32_t *)(void *)(addr))

#if !defined(__lint) && defined(__GNUC__)

static __inline__ uint32_t
load_little_32(uint32_t *addr)
{
	uint32_t value;

	__asm__(
	    "lduwa	[%1] %2, %0\n\t"
	    : "=r" (value)
	    : "r" (addr), "i" (ASI_PL));

	return (value);
}
#endif	/* !__lint && __GNUC__ */

#if !defined(__GNUC__)
extern	uint32_t load_little_32(uint32_t *);
#endif	/* !__GNUC__ */

/* Placate lint */
#if defined(__lint)
uint32_t
load_little_32(uint32_t *addr)
{
	return (*addr);
}
#endif	/* __lint */

#elif defined(_LITTLE_ENDIAN)
#define	LOAD_LITTLE_32(addr)	htonl(addr)

#else
/* big endian -- will work on little endian, but slowly */
/* Since we do byte operations, we don't have to check for alignment. */
#define	LOAD_LITTLE_32(addr)	\
	((addr)[0] | ((addr)[1] << 8) | ((addr)[2] << 16) | ((addr)[3] << 24))
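
/*
 * For example (illustration only), if the four bytes at `addr' are
 * 0x78, 0x56, 0x34, 0x12 in that order, the expression assembles them as
 *
 *	0x78 | (0x56 << 8) | (0x34 << 16) | (0x12 << 24) == 0x12345678
 *
 * i.e. the little-endian word is returned in host byte order.
 */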
#endif	/* sun4u */

#if defined(sun4v)

/*
 * For N1 we want to minimize the number of arithmetic operations.  This is
 * best achieved by using the %asi register to specify the ASI for the lduwa
 * operations.  Also, we have a separate inline template for each word, so we
 * can utilize the immediate offset in lduwa without relying on the compiler
 * to do the right thing.
 *
 * Moving to 64-bit loads might also be beneficial.
 */
#define	LOAD_LITTLE_32_0(addr)	load_little_32_0((uint32_t *)(addr))
#define	LOAD_LITTLE_32_1(addr)	load_little_32_1((uint32_t *)(addr))
#define	LOAD_LITTLE_32_2(addr)	load_little_32_2((uint32_t *)(addr))
#define	LOAD_LITTLE_32_3(addr)	load_little_32_3((uint32_t *)(addr))
#define	LOAD_LITTLE_32_4(addr)	load_little_32_4((uint32_t *)(addr))
#define	LOAD_LITTLE_32_5(addr)	load_little_32_5((uint32_t *)(addr))
#define	LOAD_LITTLE_32_6(addr)	load_little_32_6((uint32_t *)(addr))
#define	LOAD_LITTLE_32_7(addr)	load_little_32_7((uint32_t *)(addr))
#define	LOAD_LITTLE_32_8(addr)	load_little_32_8((uint32_t *)(addr))
#define	LOAD_LITTLE_32_9(addr)	load_little_32_9((uint32_t *)(addr))
#define	LOAD_LITTLE_32_a(addr)	load_little_32_a((uint32_t *)(addr))
#define	LOAD_LITTLE_32_b(addr)	load_little_32_b((uint32_t *)(addr))
#define	LOAD_LITTLE_32_c(addr)	load_little_32_c((uint32_t *)(addr))
#define	LOAD_LITTLE_32_d(addr)	load_little_32_d((uint32_t *)(addr))
#define	LOAD_LITTLE_32_e(addr)	load_little_32_e((uint32_t *)(addr))
#define	LOAD_LITTLE_32_f(addr)	load_little_32_f((uint32_t *)(addr))
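
/*
 * Illustrative call pattern (a sketch only; `blk', `x' and `old_asi' are
 * hypothetical names, and the real consumer in md5.c may differ):  the
 * caller saves the current %asi, programs the little-endian ASI, performs
 * the sixteen loads, and then restores %asi, e.g.
 *
 *	uint8_t old_asi = get_little();
 *	set_little(ASI_PL);
 *	x[0] = LOAD_LITTLE_32_0(blk);
 *	x[1] = LOAD_LITTLE_32_1(blk);
 *	...
 *	x[15] = LOAD_LITTLE_32_f(blk);
 *	set_little(old_asi);
 *
 * As noted above, `blk' is assumed to be 4-byte aligned; no check is made
 * here.
 */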

#if !defined(__lint) && defined(__GNUC__)

/*
 * This actually sets the ASI register, not necessarily to ASI_PL.
 */
static __inline__ void
set_little(uint8_t asi)
{
	__asm__ __volatile__(
	    "wr	%%g0, %0, %%asi\n\t"
	    : /* Nothing */
	    : "r" (asi));
}

static __inline__ uint8_t
get_little(void)
{
	uint8_t asi;

	__asm__ __volatile__(
	    "rd	%%asi, %0\n\t"
	    : "=r" (asi));

	return (asi);
}

/*
 * We have 16 functions which differ only in the offset from which they
 * load.  Use this preprocessor template to simplify maintenance.  Its
 * argument is the offset in hex, without the 0x.
 */
#define	LL_TEMPLATE(__off)			\
static __inline__ uint32_t			\
load_little_32_##__off(uint32_t *addr)		\
{						\
	uint32_t value;				\
	__asm__(				\
		"lduwa	[%1 + %2]%%asi, %0\n\t"	\
	: "=r" (value)				\
	: "r" (addr), "i" ((0x##__off) << 2));	\
	return (value);				\
}

LL_TEMPLATE(0)
LL_TEMPLATE(1)
LL_TEMPLATE(2)
LL_TEMPLATE(3)
LL_TEMPLATE(4)
LL_TEMPLATE(5)
LL_TEMPLATE(6)
LL_TEMPLATE(7)
LL_TEMPLATE(8)
LL_TEMPLATE(9)
LL_TEMPLATE(a)
LL_TEMPLATE(b)
LL_TEMPLATE(c)
LL_TEMPLATE(d)
LL_TEMPLATE(e)
LL_TEMPLATE(f)
#undef	LL_TEMPLATE
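
/*
 * For reference (whitespace aside), LL_TEMPLATE(4) above expands to
 *
 *	static __inline__ uint32_t
 *	load_little_32_4(uint32_t *addr)
 *	{
 *		uint32_t value;
 *		__asm__("lduwa	[%1 + %2]%%asi, %0\n\t"
 *		    : "=r" (value)
 *		    : "r" (addr), "i" ((0x4) << 2));
 *		return (value);
 *	}
 *
 * so each load_little_32_<n> reads the 32-bit word at byte offset 4 * <n>
 * from `addr' using whatever ASI is currently programmed into %asi.
 */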

#endif	/* !__lint && __GNUC__ */

#if !defined(__GNUC__)
/*
 * Using the %asi register to achieve little-endian loads - the register
 * is set using an inline template.
 *
 * Saves a few arithmetic ops, as we can now use an immediate offset with
 * the lduwa instructions.
 */
extern void set_little(uint32_t);
extern uint32_t get_little(void);

extern	uint32_t load_little_32_0(uint32_t *);
extern	uint32_t load_little_32_1(uint32_t *);
extern	uint32_t load_little_32_2(uint32_t *);
extern	uint32_t load_little_32_3(uint32_t *);
extern	uint32_t load_little_32_4(uint32_t *);
extern	uint32_t load_little_32_5(uint32_t *);
extern	uint32_t load_little_32_6(uint32_t *);
extern	uint32_t load_little_32_7(uint32_t *);
extern	uint32_t load_little_32_8(uint32_t *);
extern	uint32_t load_little_32_9(uint32_t *);
extern	uint32_t load_little_32_a(uint32_t *);
extern	uint32_t load_little_32_b(uint32_t *);
extern	uint32_t load_little_32_c(uint32_t *);
extern	uint32_t load_little_32_d(uint32_t *);
extern	uint32_t load_little_32_e(uint32_t *);
extern	uint32_t load_little_32_f(uint32_t *);
#endif	/* !__GNUC__ */
#endif	/* sun4v */

#endif	/* _LITTLE_ENDIAN */

#ifdef	__cplusplus
}
#endif

#endif	/* !_MD5_BYTESWAP_H */