/*-
 * Copyright (c) 2013 Ed Schouten <ed@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/stdatomic.h>
#include <sys/types.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/sysarch.h>

/*
 * Executing statements with interrupts disabled.
 */

#if defined(_KERNEL) && !defined(SMP)
#define	WITHOUT_INTERRUPTS(s) do {					\
	register_t regs;						\
									\
	regs = intr_disable();						\
	do s while (0);							\
	intr_restore(regs);						\
} while (0)
#endif /* _KERNEL && !SMP */
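
/*
 * Illustrative use of the macro above (a sketch; the only real callers
 * are the generated atomics below): an arbitrary statement block is
 * bracketed by intr_disable()/intr_restore(), e.g.
 *
 *	WITHOUT_INTERRUPTS({
 *		old = *counter;
 *		*counter = old + 1;
 *	});
 *
 * where "counter" and "old" are hypothetical locals.
 */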

/*
 * Memory barriers.
 *
 * It turns out __sync_synchronize() does not emit any code when used
 * with GCC 4.2. Implement our own version that does work reliably.
 *
 * Although __sync_lock_test_and_set() should only perform an acquire
 * barrier, make it do a full barrier like the other functions. This
 * should make <stdatomic.h>'s atomic_exchange_explicit() work reliably.
 */

#if defined(_KERNEL) && !defined(SMP)
static inline void
do_sync(void)
{

	__asm volatile ("" : : : "memory");
}
#elif __ARM_ARCH >= 6
static inline void
do_sync(void)
{

	dmb();
}
#endif
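
/*
 * Usage note (descriptive only): the ARMv6+ __sync_*() routines below
 * issue do_sync() before entering their ldrex/strex loops.  In the
 * uniprocessor kernel case the barrier degenerates to a compiler-level
 * memory clobber, since a single CPU needs no hardware ordering.
 */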

#if defined(__CLANG_ATOMICS) || defined(__GNUC_ATOMICS)

/*
 * New C11 __atomic_* API.
 */

/* On ARMv6 and newer, the compiler itself provides these operations. */
#if __ARM_ARCH <= 5

/* Clang doesn't allow us to reimplement builtins without this. */
#ifdef __clang__
#pragma redefine_extname __sync_synchronize_ext __sync_synchronize
#define __sync_synchronize __sync_synchronize_ext
#endif

void
__sync_synchronize(void)
{
}

#ifdef _KERNEL

#ifdef SMP
#error "On SMP systems we should have proper atomic operations."
#endif

/*
 * On uniprocessor systems, we can perform the atomic operations by
 * disabling interrupts.
 */
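
/*
 * For example (a hypothetical expansion of the macros below),
 * __atomic_fetch_add_4() amounts to:
 *
 *	regs = intr_disable();
 *	ret = *mem;
 *	*mem += val;
 *	intr_restore(regs);
 *	return (ret);
 *
 * This is atomic on a uniprocessor because no interrupt, and therefore
 * no context switch, can occur between the load and the store.
 */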

#define	EMIT_LOAD_N(N, uintN_t)						\
uintN_t									\
__atomic_load_##N(uintN_t *mem, int model __unused)			\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
	});								\
	return (ret);							\
}

#define	EMIT_STORE_N(N, uintN_t)					\
void									\
__atomic_store_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
									\
	WITHOUT_INTERRUPTS({						\
		*mem = val;						\
	});								\
}

#define	EMIT_COMPARE_EXCHANGE_N(N, uintN_t)				\
_Bool									\
__atomic_compare_exchange_##N(uintN_t *mem, uintN_t *expected,		\
    uintN_t desired, int success __unused, int failure __unused)	\
{									\
	_Bool ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		if (*mem == *expected) {				\
			*mem = desired;					\
			ret = 1;					\
		} else {						\
			*expected = *mem;				\
			ret = 0;					\
		}							\
	});								\
	return (ret);							\
}

#define	EMIT_FETCH_OP_N(N, uintN_t, name, op)				\
uintN_t									\
__atomic_##name##_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
		*mem op val;						\
	});								\
	return (ret);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t)					\
EMIT_LOAD_N(N, uintN_t)							\
EMIT_STORE_N(N, uintN_t)						\
EMIT_COMPARE_EXCHANGE_N(N, uintN_t)					\
EMIT_FETCH_OP_N(N, uintN_t, exchange, =)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_add, +=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_and, &=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_or, |=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_sub, -=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_xor, ^=)

EMIT_ALL_OPS_N(1, uint8_t)
EMIT_ALL_OPS_N(2, uint16_t)
EMIT_ALL_OPS_N(4, uint32_t)
EMIT_ALL_OPS_N(8, uint64_t)
#undef	EMIT_ALL_OPS_N

#else /* !_KERNEL */

/*
 * For userspace on uniprocessor systems, we can implement the atomic
 * operations by using a Restartable Atomic Sequence. This makes the
 * kernel restart the code from the beginning when interrupted.
 */
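
/*
 * Sketch of the mechanism relied upon below (assuming the standard
 * FreeBSD/ARM RAS protocol): each sequence first registers its own
 * start and end addresses at ARM_RAS_START and ARM_RAS_START + 4.
 * If the thread is interrupted while executing inside the registered
 * range, the kernel rewinds it to the start address, so the whole
 * load/modify/store runs again instead of completing with a stale
 * value.  The trailing stores reset the registration.
 */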

#define	EMIT_LOAD_N(N, uintN_t)						\
uintN_t									\
__atomic_load_##N(uintN_t *mem, int model __unused)			\
{									\
									\
	return (*mem);							\
}

#define	EMIT_STORE_N(N, uintN_t)					\
void									\
__atomic_store_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
									\
	*mem = val;							\
}

#define	EMIT_EXCHANGE_N(N, uintN_t, ldr, str)				\
uintN_t									\
__atomic_exchange_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"str" %3, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_COMPARE_EXCHANGE_N(N, uintN_t, ldr, streq)			\
_Bool									\
__atomic_compare_exchange_##N(uintN_t *mem, uintN_t *pexpected,		\
    uintN_t desired, int success __unused, int failure __unused)	\
{									\
	uint32_t expected, old, temp, ras_start;			\
									\
	expected = *pexpected;						\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%6]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%6, #4]\n"				\
									\
		"\t"ldr" %0, %5\n"	/* Load old value. */		\
		"\tcmp   %0, %3\n"	/* Compare to expected value. */\
		"\t"streq" %4, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%6]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%6, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (expected), "r" (desired), "m" (*mem),		\
		  "r" (ras_start));					\
	if (old == expected) {						\
		return (1);						\
	} else {							\
		*pexpected = old;					\
		return (0);						\
	}								\
}

#define	EMIT_FETCH_OP_N(N, uintN_t, ldr, str, name, op, ret)		\
uintN_t									\
__atomic_##name##_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
	uint32_t old, new, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %3\n"	/* Calculate new value. */	\
		"\t"str" %2, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (new)			\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (ret);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t, ldr, str, streq)			\
EMIT_LOAD_N(N, uintN_t)							\
EMIT_STORE_N(N, uintN_t)						\
EMIT_EXCHANGE_N(N, uintN_t, ldr, str)					\
EMIT_COMPARE_EXCHANGE_N(N, uintN_t, ldr, streq)				\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_add, "add", old)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_and, "and", old)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_or,  "orr", old)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_sub, "sub", old)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_xor, "eor", old)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, add_fetch, "add", new)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, and_fetch, "and", new)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, or_fetch,  "orr", new)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, sub_fetch, "sub", new)		\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, xor_fetch, "eor", new)

EMIT_ALL_OPS_N(1, uint8_t, "ldrb", "strb", "strbeq")
EMIT_ALL_OPS_N(2, uint16_t, "ldrh", "strh", "strheq")
EMIT_ALL_OPS_N(4, uint32_t, "ldr", "str", "streq")
#undef	EMIT_ALL_OPS_N

#endif /* _KERNEL */

#endif /* __ARM_ARCH */

#endif /* __CLANG_ATOMICS || __GNUC_ATOMICS */

#if defined(__SYNC_ATOMICS) || defined(EMIT_SYNC_ATOMICS)

#ifdef __clang__
#pragma redefine_extname __sync_lock_test_and_set_1_c __sync_lock_test_and_set_1
#pragma redefine_extname __sync_lock_test_and_set_2_c __sync_lock_test_and_set_2
#pragma redefine_extname __sync_lock_test_and_set_4_c __sync_lock_test_and_set_4
#pragma redefine_extname __sync_val_compare_and_swap_1_c __sync_val_compare_and_swap_1
#pragma redefine_extname __sync_val_compare_and_swap_2_c __sync_val_compare_and_swap_2
#pragma redefine_extname __sync_val_compare_and_swap_4_c __sync_val_compare_and_swap_4
#pragma redefine_extname __sync_fetch_and_add_1_c __sync_fetch_and_add_1
#pragma redefine_extname __sync_fetch_and_add_2_c __sync_fetch_and_add_2
#pragma redefine_extname __sync_fetch_and_add_4_c __sync_fetch_and_add_4
#pragma redefine_extname __sync_fetch_and_and_1_c __sync_fetch_and_and_1
#pragma redefine_extname __sync_fetch_and_and_2_c __sync_fetch_and_and_2
#pragma redefine_extname __sync_fetch_and_and_4_c __sync_fetch_and_and_4
#pragma redefine_extname __sync_fetch_and_or_1_c __sync_fetch_and_or_1
#pragma redefine_extname __sync_fetch_and_or_2_c __sync_fetch_and_or_2
#pragma redefine_extname __sync_fetch_and_or_4_c __sync_fetch_and_or_4
#pragma redefine_extname __sync_fetch_and_xor_1_c __sync_fetch_and_xor_1
#pragma redefine_extname __sync_fetch_and_xor_2_c __sync_fetch_and_xor_2
#pragma redefine_extname __sync_fetch_and_xor_4_c __sync_fetch_and_xor_4
#pragma redefine_extname __sync_fetch_and_sub_1_c __sync_fetch_and_sub_1
#pragma redefine_extname __sync_fetch_and_sub_2_c __sync_fetch_and_sub_2
#pragma redefine_extname __sync_fetch_and_sub_4_c __sync_fetch_and_sub_4
#endif

/*
 * Old __sync_* API.
 */

#if __ARM_ARCH >= 6

/* Implementations for old GCC versions, lacking support for atomics. */

typedef union {
	uint8_t		v8[4];
	uint32_t	v32;
} reg_t;

/*
 * Given a memory address pointing to an 8-bit or 16-bit integer, return
 * the address of the 32-bit word containing it.
 */

static inline uint32_t *
round_to_word(void *ptr)
{

	return ((uint32_t *)((intptr_t)ptr & ~3));
}

/*
 * Utility functions for loading and storing 8-bit and 16-bit integers
 * in 32-bit words at an offset corresponding with the location of the
 * atomic variable.
 */

static inline void
put_1(reg_t *r, const uint8_t *offset_ptr, uint8_t val)
{
	size_t offset;

	offset = (intptr_t)offset_ptr & 3;
	r->v8[offset] = val;
}

static inline uint8_t
get_1(const reg_t *r, const uint8_t *offset_ptr)
{
	size_t offset;

	offset = (intptr_t)offset_ptr & 3;
	return (r->v8[offset]);
}

static inline void
put_2(reg_t *r, const uint16_t *offset_ptr, uint16_t val)
{
	size_t offset;
	union {
		uint16_t in;
		uint8_t out[2];
	} bytes;

	offset = (intptr_t)offset_ptr & 3;
	bytes.in = val;
	r->v8[offset] = bytes.out[0];
	r->v8[offset + 1] = bytes.out[1];
}

static inline uint16_t
get_2(const reg_t *r, const uint16_t *offset_ptr)
{
	size_t offset;
	union {
		uint8_t in[2];
		uint16_t out;
	} bytes;

	offset = (intptr_t)offset_ptr & 3;
	bytes.in[0] = r->v8[offset];
	bytes.in[1] = r->v8[offset + 1];
	return (bytes.out);
}
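
/*
 * Worked example (illustrative values only): a uint16_t at address
 * 0x1006 lives in the aligned word returned by round_to_word(),
 * 0x1004, at byte offset 2.  put_2() then writes the two halves of the
 * value into v8[2] and v8[3], and get_2() reassembles them from the
 * same bytes.
 */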

/*
 * 8-bit and 16-bit routines.
 *
 * These operations are not natively supported by the CPU, so we use
 * some shifting and bitmasking on top of the 32-bit instructions.
 */
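
/*
 * Sketch of the masking scheme used below, for a uint8_t at byte
 * offset 1 of its containing word: val32 holds the new value in byte 1
 * and zeroes elsewhere, while negmask holds zeroes in byte 1 and ones
 * elsewhere.  ANDing the old word with negmask clears the target byte
 * and ORing in val32 inserts the new one, leaving the neighbouring
 * bytes untouched while the whole word goes through ldrex/strex.
 */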

#define	EMIT_LOCK_TEST_AND_SET_N(N, uintN_t)				\
uintN_t									\
__sync_lock_test_and_set_##N##_c(uintN_t *mem, uintN_t val)		\
{									\
	uint32_t *mem32;						\
	reg_t val32, negmask, old;					\
	uint32_t temp1, temp2;						\
									\
	mem32 = round_to_word(mem);					\
	val32.v32 = 0x00000000;						\
	put_##N(&val32, mem, val);					\
	negmask.v32 = 0xffffffff;					\
	put_##N(&negmask, mem, 0);					\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %6\n"	/* Load old value. */		\
		"\tand   %2, %5, %0\n"	/* Remove the old value. */	\
		"\torr   %2, %2, %4\n"	/* Put in the new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),	\
		  "=&r" (temp2)						\
		: "r" (val32.v32), "r" (negmask.v32), "m" (*mem32));	\
	return (get_##N(&old, mem));					\
}

EMIT_LOCK_TEST_AND_SET_N(1, uint8_t)
EMIT_LOCK_TEST_AND_SET_N(2, uint16_t)

#define	EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)				\
uintN_t									\
__sync_val_compare_and_swap_##N##_c(uintN_t *mem, uintN_t expected,	\
    uintN_t desired)							\
{									\
	uint32_t *mem32;						\
	reg_t expected32, desired32, posmask, old;			\
	uint32_t negmask, temp1, temp2;					\
									\
	mem32 = round_to_word(mem);					\
	expected32.v32 = 0x00000000;					\
	put_##N(&expected32, mem, expected);				\
	desired32.v32 = 0x00000000;					\
	put_##N(&desired32, mem, desired);				\
	posmask.v32 = 0x00000000;					\
	put_##N(&posmask, mem, ~0);					\
	negmask = ~posmask.v32;						\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %8\n"	/* Load old value. */		\
		"\tand   %2, %6, %0\n"	/* Isolate the old value. */	\
		"\tcmp   %2, %4\n"	/* Compare to expected value. */\
		"\tbne   2f\n"		/* Values are unequal. */	\
		"\tand   %2, %7, %0\n"	/* Remove the old value. */	\
		"\torr   %2, %5\n"	/* Put in the new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		"2:"							\
		: "=&r" (old), "=m" (*mem32), "=&r" (temp1),		\
		  "=&r" (temp2)						\
		: "r" (expected32.v32), "r" (desired32.v32),		\
		  "r" (posmask.v32), "r" (negmask), "m" (*mem32));	\
	return (get_##N(&old, mem));					\
}

EMIT_VAL_COMPARE_AND_SWAP_N(1, uint8_t)
EMIT_VAL_COMPARE_AND_SWAP_N(2, uint16_t)

#define	EMIT_ARITHMETIC_FETCH_AND_OP_N(N, uintN_t, name, op)		\
uintN_t									\
__sync_##name##_##N##_c(uintN_t *mem, uintN_t val)			\
{									\
	uint32_t *mem32;						\
	reg_t val32, posmask, old;					\
	uint32_t negmask, temp1, temp2;					\
									\
	mem32 = round_to_word(mem);					\
	val32.v32 = 0x00000000;						\
	put_##N(&val32, mem, val);					\
	posmask.v32 = 0x00000000;					\
	put_##N(&posmask, mem, ~0);					\
	negmask = ~posmask.v32;						\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %7\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %4\n"	/* Calculate new value. */	\
		"\tand   %2, %5\n"	/* Isolate the new value. */	\
		"\tand   %3, %6, %0\n"	/* Remove the old value. */	\
		"\torr   %2, %2, %3\n"	/* Put in the new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),	\
		  "=&r" (temp2)						\
		: "r" (val32.v32), "r" (posmask.v32), "r" (negmask),	\
		  "m" (*mem32));					\
	return (get_##N(&old, mem));					\
}

EMIT_ARITHMETIC_FETCH_AND_OP_N(1, uint8_t, fetch_and_add, "add")
EMIT_ARITHMETIC_FETCH_AND_OP_N(1, uint8_t, fetch_and_sub, "sub")
EMIT_ARITHMETIC_FETCH_AND_OP_N(2, uint16_t, fetch_and_add, "add")
EMIT_ARITHMETIC_FETCH_AND_OP_N(2, uint16_t, fetch_and_sub, "sub")

#define	EMIT_BITWISE_FETCH_AND_OP_N(N, uintN_t, name, op, idempotence)	\
uintN_t									\
__sync_##name##_##N##_c(uintN_t *mem, uintN_t val)			\
{									\
	uint32_t *mem32;						\
	reg_t val32, old;						\
	uint32_t temp1, temp2;						\
									\
	mem32 = round_to_word(mem);					\
	val32.v32 = idempotence ? 0xffffffff : 0x00000000;		\
	put_##N(&val32, mem, val);					\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %5\n"	/* Load old value. */		\
		"\t"op"  %2, %4, %0\n"	/* Calculate new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),	\
		  "=&r" (temp2)						\
		: "r" (val32.v32), "m" (*mem32));			\
	return (get_##N(&old, mem));					\
}

EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_and, "and", 1)
EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_or, "orr", 0)
EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_xor, "eor", 0)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_and, "and", 1)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_or, "orr", 0)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_xor, "eor", 0)

/*
 * 32-bit routines.
 */

uint32_t
__sync_lock_test_and_set_4_c(uint32_t *mem, uint32_t val)
{
	uint32_t old, temp;

	do_sync();
	__asm volatile (
		"1:"
		"\tldrex %0, %4\n"	/* Load old value. */
		"\tstrex %2, %3, %1\n"	/* Attempt to store. */
		"\tcmp   %2, #0\n"	/* Did it succeed? */
		"\tbne   1b\n"		/* Spin if failed. */
		: "=&r" (old), "=m" (*mem), "=&r" (temp)
		: "r" (val), "m" (*mem));
	return (old);
}

uint32_t
__sync_val_compare_and_swap_4_c(uint32_t *mem, uint32_t expected,
    uint32_t desired)
{
	uint32_t old, temp;

	do_sync();
	__asm volatile (
		"1:"
		"\tldrex %0, %5\n"	/* Load old value. */
		"\tcmp   %0, %3\n"	/* Compare to expected value. */
		"\tbne   2f\n"		/* Values are unequal. */
		"\tstrex %2, %4, %1\n"	/* Attempt to store. */
		"\tcmp   %2, #0\n"	/* Did it succeed? */
		"\tbne   1b\n"		/* Spin if failed. */
		"2:"
		: "=&r" (old), "=m" (*mem), "=&r" (temp)
		: "r" (expected), "r" (desired), "m" (*mem));
	return (old);
}

#define	EMIT_FETCH_AND_OP_4(name, op)					\
uint32_t								\
__sync_##name##_4##_c(uint32_t *mem, uint32_t val)			\
{									\
	uint32_t old, temp1, temp2;					\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %5\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %4\n"	/* Calculate new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old), "=m" (*mem), "=&r" (temp1),		\
		  "=&r" (temp2)						\
		: "r" (val), "m" (*mem));				\
	return (old);							\
}

EMIT_FETCH_AND_OP_4(fetch_and_add, "add")
EMIT_FETCH_AND_OP_4(fetch_and_and, "and")
EMIT_FETCH_AND_OP_4(fetch_and_or, "orr")
EMIT_FETCH_AND_OP_4(fetch_and_sub, "sub")
EMIT_FETCH_AND_OP_4(fetch_and_xor, "eor")

#ifndef __clang__
__strong_reference(__sync_lock_test_and_set_1_c, __sync_lock_test_and_set_1);
__strong_reference(__sync_lock_test_and_set_2_c, __sync_lock_test_and_set_2);
__strong_reference(__sync_lock_test_and_set_4_c, __sync_lock_test_and_set_4);
__strong_reference(__sync_val_compare_and_swap_1_c, __sync_val_compare_and_swap_1);
__strong_reference(__sync_val_compare_and_swap_2_c, __sync_val_compare_and_swap_2);
__strong_reference(__sync_val_compare_and_swap_4_c, __sync_val_compare_and_swap_4);
__strong_reference(__sync_fetch_and_add_1_c, __sync_fetch_and_add_1);
__strong_reference(__sync_fetch_and_add_2_c, __sync_fetch_and_add_2);
__strong_reference(__sync_fetch_and_add_4_c, __sync_fetch_and_add_4);
__strong_reference(__sync_fetch_and_and_1_c, __sync_fetch_and_and_1);
__strong_reference(__sync_fetch_and_and_2_c, __sync_fetch_and_and_2);
__strong_reference(__sync_fetch_and_and_4_c, __sync_fetch_and_and_4);
__strong_reference(__sync_fetch_and_sub_1_c, __sync_fetch_and_sub_1);
__strong_reference(__sync_fetch_and_sub_2_c, __sync_fetch_and_sub_2);
__strong_reference(__sync_fetch_and_sub_4_c, __sync_fetch_and_sub_4);
__strong_reference(__sync_fetch_and_or_1_c, __sync_fetch_and_or_1);
__strong_reference(__sync_fetch_and_or_2_c, __sync_fetch_and_or_2);
__strong_reference(__sync_fetch_and_or_4_c, __sync_fetch_and_or_4);
__strong_reference(__sync_fetch_and_xor_1_c, __sync_fetch_and_xor_1);
__strong_reference(__sync_fetch_and_xor_2_c, __sync_fetch_and_xor_2);
__strong_reference(__sync_fetch_and_xor_4_c, __sync_fetch_and_xor_4);
#endif

#else /* __ARM_ARCH < 6 */

#ifdef _KERNEL

#ifdef SMP
#error "On SMP systems we should have proper atomic operations."
#endif

/*
 * On uniprocessor systems, we can perform the atomic operations by
 * disabling interrupts.
 */

#define	EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)				\
uintN_t									\
__sync_val_compare_and_swap_##N(uintN_t *mem, uintN_t expected,		\
    uintN_t desired)							\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
		if (*mem == expected)					\
			*mem = desired;					\
	});								\
	return (ret);							\
}

#define	EMIT_FETCH_AND_OP_N(N, uintN_t, name, op)			\
uintN_t									\
__sync_##name##_##N(uintN_t *mem, uintN_t val)				\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
		*mem op val;						\
	});								\
	return (ret);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t)					\
EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)					\
EMIT_FETCH_AND_OP_N(N, uintN_t, lock_test_and_set, =)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_add, +=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_and, &=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_or, |=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_sub, -=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_xor, ^=)

EMIT_ALL_OPS_N(1, uint8_t)
EMIT_ALL_OPS_N(2, uint16_t)
EMIT_ALL_OPS_N(4, uint32_t)
EMIT_ALL_OPS_N(8, uint64_t)
#undef	EMIT_ALL_OPS_N

#else /* !_KERNEL */

/*
 * For userspace on uniprocessor systems, we can implement the atomic
 * operations by using a Restartable Atomic Sequence. This makes the
 * kernel restart the code from the beginning when interrupted.
 */

#define	EMIT_LOCK_TEST_AND_SET_N(N, uintN_t, ldr, str)			\
uintN_t									\
__sync_lock_test_and_set_##N##_c(uintN_t *mem, uintN_t val)		\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"str" %3, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t, ldr, streq)		\
uintN_t									\
__sync_val_compare_and_swap_##N##_c(uintN_t *mem, uintN_t expected,	\
    uintN_t desired)							\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%6]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%6, #4]\n"				\
									\
		"\t"ldr" %0, %5\n"	/* Load old value. */		\
		"\tcmp   %0, %3\n"	/* Compare to expected value. */\
		"\t"streq" %4, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%6]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%6, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (expected), "r" (desired), "m" (*mem),		\
		  "r" (ras_start));					\
	return (old);							\
}

#define	EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, name, op)		\
uintN_t									\
__sync_##name##_##N##_c(uintN_t *mem, uintN_t val)			\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %3\n"	/* Calculate new value. */	\
		"\t"str" %2, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t, ldr, str, streq)			\
EMIT_LOCK_TEST_AND_SET_N(N, uintN_t, ldr, str)				\
EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t, ldr, streq)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_add, "add")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_and, "and")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_or, "orr")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_sub, "sub")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_xor, "eor")

#ifdef __clang__
EMIT_ALL_OPS_N(1, uint8_t, "ldrb", "strb", "strbeq")
EMIT_ALL_OPS_N(2, uint16_t, "ldrh", "strh", "strheq")
#else
EMIT_ALL_OPS_N(1, uint8_t, "ldrb", "strb", "streqb")
EMIT_ALL_OPS_N(2, uint16_t, "ldrh", "strh", "streqh")
#endif
EMIT_ALL_OPS_N(4, uint32_t, "ldr", "str", "streq")

#ifndef __clang__
__strong_reference(__sync_lock_test_and_set_1_c, __sync_lock_test_and_set_1);
__strong_reference(__sync_lock_test_and_set_2_c, __sync_lock_test_and_set_2);
__strong_reference(__sync_lock_test_and_set_4_c, __sync_lock_test_and_set_4);
__strong_reference(__sync_val_compare_and_swap_1_c, __sync_val_compare_and_swap_1);
__strong_reference(__sync_val_compare_and_swap_2_c, __sync_val_compare_and_swap_2);
__strong_reference(__sync_val_compare_and_swap_4_c, __sync_val_compare_and_swap_4);
__strong_reference(__sync_fetch_and_add_1_c, __sync_fetch_and_add_1);
__strong_reference(__sync_fetch_and_add_2_c, __sync_fetch_and_add_2);
__strong_reference(__sync_fetch_and_add_4_c, __sync_fetch_and_add_4);
__strong_reference(__sync_fetch_and_and_1_c, __sync_fetch_and_and_1);
__strong_reference(__sync_fetch_and_and_2_c, __sync_fetch_and_and_2);
__strong_reference(__sync_fetch_and_and_4_c, __sync_fetch_and_and_4);
__strong_reference(__sync_fetch_and_sub_1_c, __sync_fetch_and_sub_1);
__strong_reference(__sync_fetch_and_sub_2_c, __sync_fetch_and_sub_2);
__strong_reference(__sync_fetch_and_sub_4_c, __sync_fetch_and_sub_4);
__strong_reference(__sync_fetch_and_or_1_c, __sync_fetch_and_or_1);
__strong_reference(__sync_fetch_and_or_2_c, __sync_fetch_and_or_2);
__strong_reference(__sync_fetch_and_or_4_c, __sync_fetch_and_or_4);
__strong_reference(__sync_fetch_and_xor_1_c, __sync_fetch_and_xor_1);
__strong_reference(__sync_fetch_and_xor_2_c, __sync_fetch_and_xor_2);
__strong_reference(__sync_fetch_and_xor_4_c, __sync_fetch_and_xor_4);
#endif /* !__clang__ */

#endif /* _KERNEL */

#endif /* __ARM_ARCH */

#endif /* __SYNC_ATOMICS */
