/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_MD5_BYTESWAP_H
#define	_MD5_BYTESWAP_H

/*
 * definitions for inline functions for little-endian loads.
 *
 * This file has special definitions for UltraSPARC architectures,
 * which have a special address space identifier for loading 32 and 16 bit
 * integers in little-endian byte order.
 *
 * This file and common/crypto/md5/sparc/sun4[uv]/byteswap.il implement the
 * same thing and must be changed together.
 */

#include <sys/types.h>
#if defined(__sparc)
#include <v9/sys/asi.h>
#elif defined(_LITTLE_ENDIAN)
#include <sys/byteorder.h>
#endif

#ifdef	__cplusplus
extern "C" {
#endif

#if defined(_LITTLE_ENDIAN)

/*
 * Little-endian optimization:  I don't need to do any weirdness.   On
 * some little-endian boxen, I'll have to do alignment checks, but I can do
 * that below.
 */

#if !defined(__i386) && !defined(__amd64)
/*
 * i386 and amd64 don't require aligned 4-byte loads.  The symbol
 * _MD5_CHECK_ALIGNMENT indicates below whether the MD5Transform function
 * requires alignment checking.
 */
#define	_MD5_CHECK_ALIGNMENT
#endif /* !__i386 && !__amd64 */

#define	LOAD_LITTLE_32(addr)	(*(uint32_t *)(void *)(addr))

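/*
 * Illustrative sketch only (not part of this header's interface): a caller
 * such as MD5Transform would be expected to honor _MD5_CHECK_ALIGNMENT
 * roughly as follows, copying unaligned input into an aligned scratch
 * buffer before issuing the 4-byte loads.  The names "blocks" and
 * "aligned_buf" are hypothetical.
 *
 *	#if defined(_MD5_CHECK_ALIGNMENT)
 *	if ((uintptr_t)blocks & 0x3) {
 *		bcopy(blocks, aligned_buf, 64);
 *		blocks = (const uint8_t *)aligned_buf;
 *	}
 *	#endif
 *	x_0 = LOAD_LITTLE_32(blocks + 0);
 *	x_1 = LOAD_LITTLE_32(blocks + 4);
 */
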
#else	/* !_LITTLE_ENDIAN */

/*
 * sparc v9/v8plus optimization:
 *
 * on the sparc v9/v8plus, we can load data little endian.  however, since
 * the compiler doesn't have direct support for little endian, we
 * link to an assembly-language routine `load_little_32' to do
 * the magic.  note that special care must be taken to ensure the
 * address is 32-bit aligned -- in the interest of speed, we don't
 * check to make sure, since careful programming can guarantee this
 * for us.
 */
#if defined(sun4u)

/* Define the alignment check because we do 4-byte little-endian loads. */
#define	_MD5_CHECK_ALIGNMENT
#define	LOAD_LITTLE_32(addr)    load_little_32((uint32_t *)(void *)(addr))

#if !defined(__lint) && defined(__GNUC__)

static __inline__ uint32_t
load_little_32(uint32_t *addr)
{
	uint32_t value;

	__asm__(
	    "lduwa	[%1] %2, %0\n\t"
	    : "=r" (value)
	    : "r" (addr), "i" (ASI_PL));

	return (value);
}
#endif	/* !__lint && __GNUC__ */

#if !defined(__GNUC__)
extern	uint32_t load_little_32(uint32_t *);
#endif	/* !__GNUC__ */

/* Placate lint */
#if defined(__lint)
uint32_t
load_little_32(uint32_t *addr)
{
	return (*addr);
}
#endif	/* __lint */

#elif defined(_LITTLE_ENDIAN)
#define	LOAD_LITTLE_32(addr)	htonl(addr)

#else
/* big endian -- will work on little endian, but slowly */
/* Since we do byte operations, we don't have to check for alignment. */
#define	LOAD_LITTLE_32(addr)	\
	((addr)[0] | ((addr)[1] << 8) | ((addr)[2] << 16) | ((addr)[3] << 24))
#endif	/* sun4u */

#if defined(sun4v)

/*
 * For the Niagara (N1) processor we want to minimize the number of
 * arithmetic operations.  This is best achieved by using the %asi register
 * to specify the ASI for the lduwa operations.  We also use a separate
 * inline template for each word, so the immediate offset in lduwa can be
 * used without relying on the compiler to do the right thing.  (See the
 * usage sketch at the end of the sun4v section.)
 *
 * Moving to 64-bit loads might also be beneficial.
 */
#define	LOAD_LITTLE_32_0(addr)	load_little_32_0((uint32_t *)(addr))
#define	LOAD_LITTLE_32_1(addr)	load_little_32_1((uint32_t *)(addr))
#define	LOAD_LITTLE_32_2(addr)	load_little_32_2((uint32_t *)(addr))
#define	LOAD_LITTLE_32_3(addr)	load_little_32_3((uint32_t *)(addr))
#define	LOAD_LITTLE_32_4(addr)	load_little_32_4((uint32_t *)(addr))
#define	LOAD_LITTLE_32_5(addr)	load_little_32_5((uint32_t *)(addr))
#define	LOAD_LITTLE_32_6(addr)	load_little_32_6((uint32_t *)(addr))
#define	LOAD_LITTLE_32_7(addr)	load_little_32_7((uint32_t *)(addr))
#define	LOAD_LITTLE_32_8(addr)	load_little_32_8((uint32_t *)(addr))
#define	LOAD_LITTLE_32_9(addr)	load_little_32_9((uint32_t *)(addr))
#define	LOAD_LITTLE_32_a(addr)	load_little_32_a((uint32_t *)(addr))
#define	LOAD_LITTLE_32_b(addr)	load_little_32_b((uint32_t *)(addr))
#define	LOAD_LITTLE_32_c(addr)	load_little_32_c((uint32_t *)(addr))
#define	LOAD_LITTLE_32_d(addr)	load_little_32_d((uint32_t *)(addr))
#define	LOAD_LITTLE_32_e(addr)	load_little_32_e((uint32_t *)(addr))
#define	LOAD_LITTLE_32_f(addr)	load_little_32_f((uint32_t *)(addr))

#if !defined(__lint) && defined(__GNUC__)

/*
 * This actually sets the ASI register, not necessarily to ASI_PL.
 */
static __inline__ void
set_little(uint8_t asi)
{
	__asm__ __volatile__(
	    "wr	%%g0, %0, %%asi\n\t"
	    : /* Nothing */
	    : "r" (asi));
}

static __inline__ uint8_t
get_little(void)
{
	uint8_t asi;

	__asm__ __volatile__(
	    "rd	%%asi, %0\n\t"
	    : "=r" (asi));

	return (asi);
}

/*
 * We have 16 functions which differ only in the offset from which they
 * load.  Use this preprocessor template to simplify maintenance.  Its
 * argument is the offset in hex, without the 0x.
 */
#define	LL_TEMPLATE(__off)			\
static __inline__ uint32_t			\
load_little_32_##__off(uint32_t *addr)		\
{						\
	uint32_t value;				\
	__asm__(				\
		"lduwa	[%1 + %2]%%asi, %0\n\t"	\
	: "=r" (value)				\
	: "r" (addr), "i" ((0x##__off) << 2));	\
	return (value);				\
}

LL_TEMPLATE(0)
LL_TEMPLATE(1)
LL_TEMPLATE(2)
LL_TEMPLATE(3)
LL_TEMPLATE(4)
LL_TEMPLATE(5)
LL_TEMPLATE(6)
LL_TEMPLATE(7)
LL_TEMPLATE(8)
LL_TEMPLATE(9)
LL_TEMPLATE(a)
LL_TEMPLATE(b)
LL_TEMPLATE(c)
LL_TEMPLATE(d)
LL_TEMPLATE(e)
LL_TEMPLATE(f)
#undef	LL_TEMPLATE

#endif	/* !__lint && __GNUC__ */

#if !defined(__GNUC__)
/*
 * Little-endian loads are done through the %asi register, which is set
 * using an inline template.
 *
 * This saves a few arithmetic operations, since an immediate offset can
 * then be used with the lduwa instructions.
 */
extern void set_little(uint32_t);
extern uint32_t get_little(void);

extern	uint32_t load_little_32_0(uint32_t *);
extern	uint32_t load_little_32_1(uint32_t *);
extern	uint32_t load_little_32_2(uint32_t *);
extern	uint32_t load_little_32_3(uint32_t *);
extern	uint32_t load_little_32_4(uint32_t *);
extern	uint32_t load_little_32_5(uint32_t *);
extern	uint32_t load_little_32_6(uint32_t *);
extern	uint32_t load_little_32_7(uint32_t *);
extern	uint32_t load_little_32_8(uint32_t *);
extern	uint32_t load_little_32_9(uint32_t *);
extern	uint32_t load_little_32_a(uint32_t *);
extern	uint32_t load_little_32_b(uint32_t *);
extern	uint32_t load_little_32_c(uint32_t *);
extern	uint32_t load_little_32_d(uint32_t *);
extern	uint32_t load_little_32_e(uint32_t *);
extern	uint32_t load_little_32_f(uint32_t *);
#endif	/* !__GNUC__ */
#endif	/* sun4v */
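
/*
 * Illustrative sketch only (assumed caller behavior, not defined here): on
 * sun4v the per-word loads above take their ASI from the %asi register, so
 * a caller such as MD5Transform would be expected to bracket the loads by
 * saving the current ASI, switching to the little-endian ASI, and restoring
 * the old value afterwards.  The names old_asi, blk and x_0..x_f are
 * hypothetical.
 *
 *	uint_t old_asi = get_little();
 *	set_little(ASI_PL);
 *	x_0 = LOAD_LITTLE_32_0(blk);
 *	x_1 = LOAD_LITTLE_32_1(blk);
 *	...
 *	x_f = LOAD_LITTLE_32_f(blk);
 *	set_little(old_asi);
 */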

#endif	/* _LITTLE_ENDIAN */

#ifdef	__cplusplus
}
#endif

#endif	/* !_MD5_BYTESWAP_H */