1/*
2 * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
3 * Licensed under the GPL
4 */
5
6#include <stdlib.h>
7#include <unistd.h>
8#include <signal.h>
9#include <errno.h>
10#include <sched.h>
11#include <sys/syscall.h>
12#include "os.h"
13#include "aio.h"
14#include "init.h"
15#include "user.h"
16#include "mode.h"
17
/* One I/O request as passed over the request pipe from submit_aio_24
 * to not_aio_thread.  The reply is delivered through aio->reply_fd.
 */
struct aio_thread_req {
	enum aio_type type;		/* AIO_READ, AIO_WRITE, or AIO_MMAP */
	int io_fd;			/* descriptor to operate on */
	unsigned long long offset;	/* file offset of the operation */
	char *buf;			/* data buffer (unused for AIO_MMAP) */
	int len;			/* number of bytes to transfer */
	struct aio_context *aio;	/* completion context; holds reply_fd */
};
26
27#if defined(HAVE_AIO_ABI)
28#include <linux/aio_abi.h>
29
30/* If we have the headers, we are going to build with AIO enabled.
31 * If we don't have aio in libc, we define the necessary stubs here.
32 */
33
34#if !defined(HAVE_AIO_LIBC)
35
/* Raw-syscall stub for io_setup(2); creates an AIO context able to hold
 * n in-flight requests.  Returns 0 on success, -1 with errno set on error.
 */
static long io_setup(int n, aio_context_t *ctxp)
{
	return syscall(__NR_io_setup, n, ctxp);
}
40
/* Raw-syscall stub for io_submit(2); queues nr iocbs on ctx.  Returns
 * the number of iocbs submitted on success, -1 with errno set on error.
 */
static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
	return syscall(__NR_io_submit, ctx, nr, iocbpp);
}
45
/* Raw-syscall stub for io_getevents(2); blocks (timeout == NULL) until
 * at least min_nr completions are available, reading at most nr of them.
 * Returns the number of events read, or -1 with errno set on error.
 */
static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{
	return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
}
51
52#endif
53
54/* The AIO_MMAP cases force the mmapped page into memory here
55 * rather than in whatever place first touches the data.  I used
56 * to do this by touching the page, but that's delicate because
57 * gcc is prone to optimizing that away.  So, what's done here
58 * is we read from the descriptor from which the page was
59 * mapped.  The caller is required to pass an offset which is
60 * inside the page that was mapped.  Thus, when the read
61 * returns, we know that the page is in the page cache, and
62 * that it now backs the mmapped area.
63 */
64
65static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf,
66		  int len, unsigned long long offset, struct aio_context *aio)
67{
68	struct iocb iocb, *iocbp = &iocb;
69	char c;
70	int err;
71
72	iocb = ((struct iocb) { .aio_data 	= (unsigned long) aio,
73				.aio_reqprio	= 0,
74				.aio_fildes	= fd,
75				.aio_buf	= (unsigned long) buf,
76				.aio_nbytes	= len,
77				.aio_offset	= offset,
78				.aio_reserved1	= 0,
79				.aio_reserved2	= 0,
80				.aio_reserved3	= 0 });
81
82	switch(type){
83	case AIO_READ:
84		iocb.aio_lio_opcode = IOCB_CMD_PREAD;
85		err = io_submit(ctx, 1, &iocbp);
86		break;
87	case AIO_WRITE:
88		iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
89		err = io_submit(ctx, 1, &iocbp);
90		break;
91	case AIO_MMAP:
92		iocb.aio_lio_opcode = IOCB_CMD_PREAD;
93		iocb.aio_buf = (unsigned long) &c;
94		iocb.aio_nbytes = sizeof(c);
95		err = io_submit(ctx, 1, &iocbp);
96		break;
97	default:
98		printk("Bogus op in do_aio - %d\n", type);
99		err = -EINVAL;
100		break;
101	}
102
103	if(err > 0)
104		err = 0;
105	else
106		err = -errno;
107
108	return err;
109}
110
111/* Initialized in an initcall and unchanged thereafter */
112static aio_context_t ctx = 0;
113
114static int aio_thread(void *arg)
115{
116	struct aio_thread_reply reply;
117	struct io_event event;
118	int err, n, reply_fd;
119
120	signal(SIGWINCH, SIG_IGN);
121
122	while(1){
123		n = io_getevents(ctx, 1, 1, &event, NULL);
124		if(n < 0){
125			if(errno == EINTR)
126				continue;
127			printk("aio_thread - io_getevents failed, "
128			       "errno = %d\n", errno);
129		}
130		else {
131			reply = ((struct aio_thread_reply)
132				{ .data = (void *) (long) event.data,
133						.err	= event.res });
134			reply_fd = ((struct aio_context *) reply.data)->reply_fd;
135			err = write(reply_fd, &reply, sizeof(reply));
136			if(err != sizeof(reply))
137				printk("aio_thread - write failed, fd = %d, "
138				       "err = %d\n", reply_fd, errno);
139		}
140	}
141	return 0;
142}
143
144#endif
145
146static int do_not_aio(struct aio_thread_req *req)
147{
148	char c;
149	unsigned long long actual;
150	int n;
151
152	actual = lseek64(req->io_fd, req->offset, SEEK_SET);
153	if(actual != req->offset)
154		return -errno;
155
156	switch(req->type){
157	case AIO_READ:
158		n = read(req->io_fd, req->buf, req->len);
159		break;
160	case AIO_WRITE:
161		n = write(req->io_fd, req->buf, req->len);
162		break;
163	case AIO_MMAP:
164		n = read(req->io_fd, &c, sizeof(c));
165		break;
166	default:
167		printk("do_not_aio - bad request type : %d\n", req->type);
168		return -EINVAL;
169	}
170
171	if(n < 0)
172		return -errno;
173	return 0;
174}
175
176/* These are initialized in initcalls and not changed */
177static int aio_req_fd_r = -1;
178static int aio_req_fd_w = -1;
179static int aio_pid = -1;
180
181static int not_aio_thread(void *arg)
182{
183	struct aio_thread_req req;
184	struct aio_thread_reply reply;
185	int err;
186
187	signal(SIGWINCH, SIG_IGN);
188	while(1){
189		err = read(aio_req_fd_r, &req, sizeof(req));
190		if(err != sizeof(req)){
191			if(err < 0)
192				printk("not_aio_thread - read failed, "
193				       "fd = %d, err = %d\n", aio_req_fd_r,
194				       errno);
195			else {
196				printk("not_aio_thread - short read, fd = %d, "
197				       "length = %d\n", aio_req_fd_r, err);
198			}
199			continue;
200		}
201		err = do_not_aio(&req);
202		reply = ((struct aio_thread_reply) { .data 	= req.aio,
203						     .err	= err });
204		err = write(req.aio->reply_fd, &reply, sizeof(reply));
205		if(err != sizeof(reply))
206			printk("not_aio_thread - write failed, fd = %d, "
207			       "err = %d\n", req.aio->reply_fd, errno);
208	}
209
210	return 0;
211}
212
213static int init_aio_24(void)
214{
215	unsigned long stack;
216	int fds[2], err;
217
218	err = os_pipe(fds, 1, 1);
219	if(err)
220		goto out;
221
222	aio_req_fd_w = fds[0];
223	aio_req_fd_r = fds[1];
224
225	err = os_set_fd_block(aio_req_fd_w, 0);
226	if(err)
227		goto out_close_pipe;
228
229	err = run_helper_thread(not_aio_thread, NULL,
230				CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
231	if(err < 0)
232		goto out_close_pipe;
233
234	aio_pid = err;
235	goto out;
236
237out_close_pipe:
238	os_close_file(fds[0]);
239	os_close_file(fds[1]);
240	aio_req_fd_w = -1;
241	aio_req_fd_r = -1;
242out:
243#ifndef HAVE_AIO_ABI
244	printk("/usr/include/linux/aio_abi.h not present during build\n");
245#endif
246	printk("2.6 host AIO support not used - falling back to I/O "
247	       "thread\n");
248	return 0;
249}
250
251#ifdef HAVE_AIO_ABI
252#define DEFAULT_24_AIO 0
253static int init_aio_26(void)
254{
255	unsigned long stack;
256	int err;
257
258	if(io_setup(256, &ctx)){
259		err = -errno;
260		printk("aio_thread failed to initialize context, err = %d\n",
261		       errno);
262		return err;
263	}
264
265	err = run_helper_thread(aio_thread, NULL,
266				CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
267	if(err < 0)
268		return err;
269
270	aio_pid = err;
271
272	printk("Using 2.6 host AIO\n");
273	return 0;
274}
275
276static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
277			 unsigned long long offset, struct aio_context *aio)
278{
279	struct aio_thread_reply reply;
280	int err;
281
282	err = do_aio(ctx, type, io_fd, buf, len, offset, aio);
283	if(err){
284		reply = ((struct aio_thread_reply) { .data = aio,
285					 .err  = err });
286		err = write(aio->reply_fd, &reply, sizeof(reply));
287		if(err != sizeof(reply)){
288			err = -errno;
289			printk("submit_aio_26 - write failed, "
290			       "fd = %d, err = %d\n", aio->reply_fd, -err);
291		}
292		else err = 0;
293	}
294
295	return err;
296}
297
298#else
299#define DEFAULT_24_AIO 1
/* Stub used when <linux/aio_abi.h> was absent at build time: 2.6 AIO
 * is unavailable, so report ENOSYS and let init_aio fall back to 2.4.
 */
static int init_aio_26(void)
{
	return -ENOSYS;
}
304
/* Stub counterpart of submit_aio_26 for builds without 2.6 AIO support;
 * should be unreachable since init_aio forces aio_24 in that case.
 */
static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
			 unsigned long long offset, struct aio_context *aio)
{
	return -ENOSYS;
}
310#endif
311
312/* Initialized in an initcall and unchanged thereafter */
313static int aio_24 = DEFAULT_24_AIO;
314
/* Command-line handler for "aio=2.4": force the single-threaded 2.4
 * fallback even when 2.6 host AIO is available.  Always succeeds.
 */
static int __init set_aio_24(char *name, int *add)
{
	aio_24 = 1;
	return 0;
}
320
321__uml_setup("aio=2.4", set_aio_24,
322"aio=2.4\n"
323"    This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n"
324"    available.  2.4 AIO is a single thread that handles one request at a\n"
325"    time, synchronously.  2.6 AIO is a thread which uses the 2.6 AIO \n"
326"    interface to handle an arbitrary number of pending requests.  2.6 AIO \n"
327"    is not available in tt mode, on 2.4 hosts, or when UML is built with\n"
328"    /usr/include/linux/aio_abi.h not available.  Many distributions don't\n"
329"    include aio_abi.h, so you will need to copy it from a kernel tree to\n"
330"    your /usr/include/linux in order to build an AIO-capable UML\n\n"
331);
332
333static int init_aio(void)
334{
335	int err;
336
337	CHOOSE_MODE(({ if(!aio_24){
338			    printk("Disabling 2.6 AIO in tt mode\n");
339			    aio_24 = 1;
340		    } }), (void) 0);
341
342	if(!aio_24){
343		err = init_aio_26();
344		if(err && (errno == ENOSYS)){
345			printk("2.6 AIO not supported on the host - "
346			       "reverting to 2.4 AIO\n");
347			aio_24 = 1;
348		}
349		else return err;
350	}
351
352	if(aio_24)
353		return init_aio_24();
354
355	return 0;
356}
357
358/* The reason for the __initcall/__uml_exitcall asymmetry is that init_aio
359 * needs to be called when the kernel is running because it calls run_helper,
360 * which needs get_free_page.  exit_aio is a __uml_exitcall because the generic
361 * kernel does not run __exitcalls on shutdown, and can't because many of them
362 * break when called outside of module unloading.
363 */
364__initcall(init_aio);
365
366static void exit_aio(void)
367{
368	if(aio_pid != -1)
369		os_kill_process(aio_pid, 1);
370}
371
372__uml_exitcall(exit_aio);
373
374static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len,
375			 unsigned long long offset, struct aio_context *aio)
376{
377	struct aio_thread_req req = { .type 		= type,
378				      .io_fd		= io_fd,
379				      .offset		= offset,
380				      .buf		= buf,
381				      .len		= len,
382				      .aio		= aio,
383	};
384	int err;
385
386	err = write(aio_req_fd_w, &req, sizeof(req));
387	if(err == sizeof(req))
388		err = 0;
389	else err = -errno;
390
391	return err;
392}
393
394int submit_aio(enum aio_type type, int io_fd, char *buf, int len,
395	       unsigned long long offset, int reply_fd,
396	       struct aio_context *aio)
397{
398	aio->reply_fd = reply_fd;
399	if(aio_24)
400		return submit_aio_24(type, io_fd, buf, len, offset, aio);
401	else {
402		return submit_aio_26(type, io_fd, buf, len, offset, aio);
403	}
404}
405