#include <aio.h>
#include <pthread.h>
#include <semaphore.h>
#include <limits.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include "syscall.h"
#include "atomic.h"
#include "libc.h"
#include "pthread_impl.h"

/* The following is a threads-based implementation of AIO with minimal
 * dependence on implementation details. Most synchronization is
 * performed with pthread primitives, but atomics and futex operations
 * are used for notification in a couple places where the pthread
 * primitives would be inefficient or impractical.
 *
 * For each fd with outstanding aio operations, an aio_queue structure
 * is maintained. These are reference-counted and destroyed by the last
 * aio worker thread to exit. Accessing any member of the aio_queue
 * structure requires a lock on the aio_queue. Adding and removing aio
 * queues themselves requires a write lock on the global map object,
 * a 4-level table mapping file descriptor numbers to aio queues. A
 * read lock on the map is used to obtain locks on existing queues by
 * excluding destruction of the queue by a different thread while it is
 * being locked.
 *
 * Each aio queue has a list of active threads/operations. Presently there
 * is a one to one relationship between threads and operations. The only
 * members of the aio_thread structure which are accessed by other threads
 * are the linked list pointers, op (which is immutable), running (which
 * is updated atomically), and err (which is synchronized via running),
 * so no locking is necessary. Most of the other members are used
 * for sharing data between the main flow of execution and the
 * cancellation cleanup handler.
 *
 * Taking any aio locks requires having all signals blocked. This is
 * necessary because aio_cancel is needed by close, and close is required
 * to be async-signal safe. All aio worker threads run with all signals
 * blocked permanently.
 */
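
/* As an illustration of the 4-level map described above, the lookup
 * performed by __aio_get_queue below amounts to:
 *
 *	int a = fd>>24;
 *	unsigned char b = fd>>16, c = fd>>8, d = fd;
 *	struct aio_queue *q = map[a][b][c][d];
 *
 * so a hypothetical fd of 0x01020304 would resolve to map[1][2][3][4],
 * while ordinary small fds (0-255) all share the map[0][0][0] leaf
 * table, which is the only part of the map normally allocated. */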

struct aio_args {
	struct aiocb *cb;
	int op;
	int err;
	sem_t sem;
};

struct aio_thread {
	pthread_t td;
	struct aiocb *cb;
	struct aio_thread *next, *prev;
	struct aio_queue *q;
	volatile int running;
	int err, op;
	ssize_t ret;
};

struct aio_queue {
	int fd, seekable, append, ref, init;
	pthread_mutex_t lock;
	pthread_cond_t cond;
	struct aio_thread *head;
};

static pthread_rwlock_t maplock = PTHREAD_RWLOCK_INITIALIZER;
static struct aio_queue *****map;
static volatile int aio_fd_cnt;
volatile int __aio_fut;

static struct aio_queue *__aio_get_queue(int fd, int need)
{
	if (fd < 0) return 0;
	int a=fd>>24;
	unsigned char b=fd>>16, c=fd>>8, d=fd;
	struct aio_queue *q = 0;
	pthread_rwlock_rdlock(&maplock);
	if ((!map || !map[a] || !map[a][b] || !map[a][b][c] || !(q=map[a][b][c][d])) && need) {
		pthread_rwlock_unlock(&maplock);
		pthread_rwlock_wrlock(&maplock);
		if (!map) map = calloc(sizeof *map, (-1U/2+1)>>24);
		if (!map) goto out;
		if (!map[a]) map[a] = calloc(sizeof **map, 256);
		if (!map[a]) goto out;
		if (!map[a][b]) map[a][b] = calloc(sizeof ***map, 256);
		if (!map[a][b]) goto out;
		if (!map[a][b][c]) map[a][b][c] = calloc(sizeof ****map, 256);
		if (!map[a][b][c]) goto out;
		if (!(q = map[a][b][c][d])) {
			map[a][b][c][d] = q = calloc(sizeof *****map, 1);
			if (q) {
				q->fd = fd;
				pthread_mutex_init(&q->lock, 0);
				pthread_cond_init(&q->cond, 0);
				a_inc(&aio_fd_cnt);
			}
		}
	}
	if (q) pthread_mutex_lock(&q->lock);
out:
	pthread_rwlock_unlock(&maplock);
	return q;
}
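
/* A minimal sketch of the caller contract (illustrative only): the queue
 * is returned with q->lock already held, so a reader such as aio_cancel
 * below does roughly
 *
 *	struct aio_queue *q = __aio_get_queue(fd, 0);
 *	if (q) {
 *		... walk q->head under the lock ...
 *		pthread_mutex_unlock(&q->lock);
 *	}
 *
 * while a worker thread additionally takes a reference (q->ref++) under
 * the lock and releases it from its cancellation cleanup handler via
 * __aio_unref_queue. */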

static void __aio_unref_queue(struct aio_queue *q)
{
	if (q->ref > 1) {
		q->ref--;
		pthread_mutex_unlock(&q->lock);
		return;
	}

	/* This is potentially the last reference, but a new reference
	 * may arrive since we cannot free the queue object without first
	 * taking the maplock, which requires releasing the queue lock. */
	pthread_mutex_unlock(&q->lock);
	pthread_rwlock_wrlock(&maplock);
	pthread_mutex_lock(&q->lock);
	if (q->ref == 1) {
		int fd=q->fd;
		int a=fd>>24;
		unsigned char b=fd>>16, c=fd>>8, d=fd;
		map[a][b][c][d] = 0;
		a_dec(&aio_fd_cnt);
		pthread_rwlock_unlock(&maplock);
		pthread_mutex_unlock(&q->lock);
		free(q);
	} else {
		q->ref--;
		pthread_rwlock_unlock(&maplock);
		pthread_mutex_unlock(&q->lock);
	}
}

static void cleanup(void *ctx)
{
	struct aio_thread *at = ctx;
	struct aio_queue *q = at->q;
	struct aiocb *cb = at->cb;
	struct sigevent sev = cb->aio_sigevent;

	/* There are four potential types of waiters we could need to wake:
	 *   1. Callers of aio_cancel/close.
	 *   2. Callers of aio_suspend with a single aiocb.
	 *   3. Callers of aio_suspend with a list.
	 *   4. AIO worker threads waiting for sequenced operations.
	 * Types 1-3 are notified via atomics/futexes, mainly for AS-safety
	 * considerations. Type 4 is notified later via a cond var. */

	cb->__ret = at->ret;
	if (a_swap(&at->running, 0) < 0)
		__wake(&at->running, -1, 1);
	if (a_swap(&cb->__err, at->err) != EINPROGRESS)
		__wake(&cb->__err, -1, 1);
	if (a_swap(&__aio_fut, 0))
		__wake(&__aio_fut, -1, 1);

	pthread_mutex_lock(&q->lock);

	if (at->next) at->next->prev = at->prev;
	if (at->prev) at->prev->next = at->next;
	else q->head = at->next;

	/* Signal aio worker threads waiting for sequenced operations. */
	pthread_cond_broadcast(&q->cond);

	__aio_unref_queue(q);

	if (sev.sigev_notify == SIGEV_SIGNAL) {
		siginfo_t si = {
			.si_signo = sev.sigev_signo,
			.si_value = sev.sigev_value,
			.si_code = SI_ASYNCIO,
			.si_pid = getpid(),
			.si_uid = getuid()
		};
		__syscall(SYS_rt_sigqueueinfo, si.si_pid, si.si_signo, &si);
	}
	if (sev.sigev_notify == SIGEV_THREAD) {
		a_store(&__pthread_self()->cancel, 0);
		sev.sigev_notify_function(sev.sigev_value);
	}
}
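
/* A concrete instance of a "type 1" waiter from the list above appears
 * in aio_cancel later in this file: it marks the target as
 * running-with-waiters via a_cas(&p->running, 1, -1) and then blocks in
 * __wait(&p->running, 0, -1, 1); the a_swap(&at->running, 0) above sees
 * the negative value and issues the matching __wake. Types 2 and 3 are
 * the aio_suspend waiters (implemented elsewhere), which sleep on
 * cb->__err and __aio_fut respectively and are woken by the swaps
 * above. */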

static void *io_thread_func(void *ctx)
{
	struct aio_thread at, *p;

	struct aio_args *args = ctx;
	struct aiocb *cb = args->cb;
	int fd = cb->aio_fildes;
	int op = args->op;
	void *buf = (void *)cb->aio_buf;
	size_t len = cb->aio_nbytes;
	off_t off = cb->aio_offset;

	struct aio_queue *q = __aio_get_queue(fd, 1);
	ssize_t ret;

	args->err = q ? 0 : EAGAIN;
	sem_post(&args->sem);
	if (!q) return 0;

	at.op = op;
	at.running = 1;
	at.ret = -1;
	at.err = ECANCELED;
	at.q = q;
	at.td = __pthread_self();
	at.cb = cb;
	at.prev = 0;
	if ((at.next = q->head)) at.next->prev = &at;
	q->head = &at;
	q->ref++;

	if (!q->init) {
		int seekable = lseek(fd, 0, SEEK_CUR) >= 0;
		q->seekable = seekable;
		q->append = !seekable || (fcntl(fd, F_GETFL) & O_APPEND);
		q->init = 1;
	}

	pthread_cleanup_push(cleanup, &at);

	/* Wait for sequenced operations. */
	if (op!=LIO_READ && (op!=LIO_WRITE || q->append)) {
		for (;;) {
			for (p=at.next; p && p->op!=LIO_WRITE; p=p->next);
			if (!p) break;
			pthread_cond_wait(&q->cond, &q->lock);
		}
	}

	pthread_mutex_unlock(&q->lock);

	switch (op) {
	case LIO_WRITE:
		ret = q->append ? write(fd, buf, len) : pwrite(fd, buf, len, off);
		break;
	case LIO_READ:
		ret = !q->seekable ? read(fd, buf, len) : pread(fd, buf, len, off);
		break;
	case O_SYNC:
		ret = fsync(fd);
		break;
	case O_DSYNC:
		ret = fdatasync(fd);
		break;
	}
	at.ret = ret;
	at.err = ret<0 ? errno : 0;

	pthread_cleanup_pop(1);

	return 0;
}
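
/* Note on the sequencing wait above (illustrative explanation): only
 * fsync-type operations and writes to an append-mode or non-seekable fd
 * are "sequenced"; they sleep on the queue's cond var until every write
 * submitted before them on the same fd has completed. For example, an
 * O_APPEND write submitted after two earlier writes to the same fd will
 * not start until both of those writes have finished, whereas positioned
 * reads and positioned writes run concurrently. */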

static int submit(struct aiocb *cb, int op)
{
	int ret = 0;
	pthread_attr_t a;
	sigset_t allmask, origmask;
	pthread_t td;
	struct aio_args args = { .cb = cb, .op = op };
	sem_init(&args.sem, 0, 0);

	if (cb->aio_sigevent.sigev_notify == SIGEV_THREAD) {
		if (cb->aio_sigevent.sigev_notify_attributes)
			a = *cb->aio_sigevent.sigev_notify_attributes;
		else
			pthread_attr_init(&a);
	} else {
		pthread_attr_init(&a);
		pthread_attr_setstacksize(&a, PTHREAD_STACK_MIN);
		pthread_attr_setguardsize(&a, 0);
	}
	pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
	sigfillset(&allmask);
	pthread_sigmask(SIG_BLOCK, &allmask, &origmask);
	cb->__err = EINPROGRESS;
	if (pthread_create(&td, &a, io_thread_func, &args)) {
		errno = EAGAIN;
		ret = -1;
	}
	pthread_sigmask(SIG_SETMASK, &origmask, 0);

	if (!ret) {
		while (sem_wait(&args.sem));
		if (args.err) {
			errno = args.err;
			ret = -1;
		}
	}

	return ret;
}
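
/* A minimal usage sketch for the submission path (hypothetical caller
 * code, not part of this file): submit a read with aio_read, poll with
 * aio_error until it leaves EINPROGRESS, then collect the result once
 * with aio_return.
 *
 *	struct aiocb cb = {
 *		.aio_fildes = fd,
 *		.aio_buf = buf,
 *		.aio_nbytes = sizeof buf,
 *		.aio_offset = 0,
 *	};
 *	if (aio_read(&cb) < 0) ...could not queue (EAGAIN)...
 *	while (aio_error(&cb) == EINPROGRESS) ...wait or do other work...
 *	ssize_t n = aio_return(&cb);   // bytes read, or -1 on error
 */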

int aio_read(struct aiocb *cb)
{
	return submit(cb, LIO_READ);
}

int aio_write(struct aiocb *cb)
{
	return submit(cb, LIO_WRITE);
}

int aio_fsync(int op, struct aiocb *cb)
{
	if (op != O_SYNC && op != O_DSYNC) {
		errno = EINVAL;
		return -1;
	}
	return submit(cb, op);
}

ssize_t aio_return(struct aiocb *cb)
{
	return cb->__ret;
}

int aio_error(const struct aiocb *cb)
{
	a_barrier();
	return cb->__err & 0x7fffffff;
}

int aio_cancel(int fd, struct aiocb *cb)
{
	sigset_t allmask, origmask;
	int ret = AIO_ALLDONE;
	struct aio_thread *p;
	struct aio_queue *q;

	/* Unspecified behavior case. Report an error. */
	if (cb && fd != cb->aio_fildes) {
		errno = EINVAL;
		return -1;
	}

	sigfillset(&allmask);
	pthread_sigmask(SIG_BLOCK, &allmask, &origmask);

	if (!(q = __aio_get_queue(fd, 0))) {
		if (fcntl(fd, F_GETFD) < 0) ret = -1;
		goto done;
	}

	for (p = q->head; p; p = p->next) {
		if (cb && cb != p->cb) continue;
		/* Transition target from running to running-with-waiters */
		if (a_cas(&p->running, 1, -1)) {
			pthread_cancel(p->td);
			__wait(&p->running, 0, -1, 1);
			if (p->err == ECANCELED) ret = AIO_CANCELED;
		}
	}

	pthread_mutex_unlock(&q->lock);
done:
	pthread_sigmask(SIG_SETMASK, &origmask, 0);
	return ret;
}
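
/* Illustrative caller pattern (hypothetical, not part of this file):
 * passing a null aiocb cancels every outstanding operation on the fd,
 * which is how __aio_close below uses it when a descriptor is about to
 * be closed.
 *
 *	int r = aio_cancel(fd, 0);
 *	if (r == AIO_CANCELED) ...at least one op was canceled...
 *	else if (r == AIO_ALLDONE) ...nothing was outstanding...
 *	else ...error, or ops could not be canceled (AIO_NOTCANCELED)...
 */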

int __aio_close(int fd)
{
	a_barrier();
	if (aio_fd_cnt) aio_cancel(fd, 0);
	return fd;
}

LFS64(aio_cancel);
LFS64(aio_error);
LFS64(aio_fsync);
LFS64(aio_read);
LFS64(aio_write);
LFS64(aio_return);