1/* 2 * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) 3 * Licensed under the GPL 4 */ 5 6#include <stdlib.h> 7#include <unistd.h> 8#include <signal.h> 9#include <errno.h> 10#include <sched.h> 11#include <sys/syscall.h> 12#include "os.h" 13#include "aio.h" 14#include "init.h" 15#include "user.h" 16#include "mode.h" 17 18struct aio_thread_req { 19 enum aio_type type; 20 int io_fd; 21 unsigned long long offset; 22 char *buf; 23 int len; 24 struct aio_context *aio; 25}; 26 27#if defined(HAVE_AIO_ABI) 28#include <linux/aio_abi.h> 29 30/* If we have the headers, we are going to build with AIO enabled. 31 * If we don't have aio in libc, we define the necessary stubs here. 32 */ 33 34#if !defined(HAVE_AIO_LIBC) 35 36static long io_setup(int n, aio_context_t *ctxp) 37{ 38 return syscall(__NR_io_setup, n, ctxp); 39} 40 41static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp) 42{ 43 return syscall(__NR_io_submit, ctx, nr, iocbpp); 44} 45 46static long io_getevents(aio_context_t ctx_id, long min_nr, long nr, 47 struct io_event *events, struct timespec *timeout) 48{ 49 return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout); 50} 51 52#endif 53 54/* The AIO_MMAP cases force the mmapped page into memory here 55 * rather than in whatever place first touches the data. I used 56 * to do this by touching the page, but that's delicate because 57 * gcc is prone to optimizing that away. So, what's done here 58 * is we read from the descriptor from which the page was 59 * mapped. The caller is required to pass an offset which is 60 * inside the page that was mapped. Thus, when the read 61 * returns, we know that the page is in the page cache, and 62 * that it now backs the mmapped area. 63 */ 64 65static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf, 66 int len, unsigned long long offset, struct aio_context *aio) 67{ 68 struct iocb iocb, *iocbp = &iocb; 69 char c; 70 int err; 71 72 iocb = ((struct iocb) { .aio_data = (unsigned long) aio, 73 .aio_reqprio = 0, 74 .aio_fildes = fd, 75 .aio_buf = (unsigned long) buf, 76 .aio_nbytes = len, 77 .aio_offset = offset, 78 .aio_reserved1 = 0, 79 .aio_reserved2 = 0, 80 .aio_reserved3 = 0 }); 81 82 switch(type){ 83 case AIO_READ: 84 iocb.aio_lio_opcode = IOCB_CMD_PREAD; 85 err = io_submit(ctx, 1, &iocbp); 86 break; 87 case AIO_WRITE: 88 iocb.aio_lio_opcode = IOCB_CMD_PWRITE; 89 err = io_submit(ctx, 1, &iocbp); 90 break; 91 case AIO_MMAP: 92 iocb.aio_lio_opcode = IOCB_CMD_PREAD; 93 iocb.aio_buf = (unsigned long) &c; 94 iocb.aio_nbytes = sizeof(c); 95 err = io_submit(ctx, 1, &iocbp); 96 break; 97 default: 98 printk("Bogus op in do_aio - %d\n", type); 99 err = -EINVAL; 100 break; 101 } 102 103 if(err > 0) 104 err = 0; 105 else 106 err = -errno; 107 108 return err; 109} 110 111/* Initialized in an initcall and unchanged thereafter */ 112static aio_context_t ctx = 0; 113 114static int aio_thread(void *arg) 115{ 116 struct aio_thread_reply reply; 117 struct io_event event; 118 int err, n, reply_fd; 119 120 signal(SIGWINCH, SIG_IGN); 121 122 while(1){ 123 n = io_getevents(ctx, 1, 1, &event, NULL); 124 if(n < 0){ 125 if(errno == EINTR) 126 continue; 127 printk("aio_thread - io_getevents failed, " 128 "errno = %d\n", errno); 129 } 130 else { 131 reply = ((struct aio_thread_reply) 132 { .data = (void *) (long) event.data, 133 .err = event.res }); 134 reply_fd = ((struct aio_context *) reply.data)->reply_fd; 135 err = write(reply_fd, &reply, sizeof(reply)); 136 if(err != sizeof(reply)) 137 printk("aio_thread - write failed, fd = %d, " 138 "err = %d\n", reply_fd, errno); 139 } 140 } 141 return 0; 142} 143 144#endif 145 146static int do_not_aio(struct aio_thread_req *req) 147{ 148 char c; 149 unsigned long long actual; 150 int n; 151 152 actual = lseek64(req->io_fd, req->offset, SEEK_SET); 153 if(actual != req->offset) 154 return -errno; 155 156 switch(req->type){ 157 case AIO_READ: 158 n = read(req->io_fd, req->buf, req->len); 159 break; 160 case AIO_WRITE: 161 n = write(req->io_fd, req->buf, req->len); 162 break; 163 case AIO_MMAP: 164 n = read(req->io_fd, &c, sizeof(c)); 165 break; 166 default: 167 printk("do_not_aio - bad request type : %d\n", req->type); 168 return -EINVAL; 169 } 170 171 if(n < 0) 172 return -errno; 173 return 0; 174} 175 176/* These are initialized in initcalls and not changed */ 177static int aio_req_fd_r = -1; 178static int aio_req_fd_w = -1; 179static int aio_pid = -1; 180 181static int not_aio_thread(void *arg) 182{ 183 struct aio_thread_req req; 184 struct aio_thread_reply reply; 185 int err; 186 187 signal(SIGWINCH, SIG_IGN); 188 while(1){ 189 err = read(aio_req_fd_r, &req, sizeof(req)); 190 if(err != sizeof(req)){ 191 if(err < 0) 192 printk("not_aio_thread - read failed, " 193 "fd = %d, err = %d\n", aio_req_fd_r, 194 errno); 195 else { 196 printk("not_aio_thread - short read, fd = %d, " 197 "length = %d\n", aio_req_fd_r, err); 198 } 199 continue; 200 } 201 err = do_not_aio(&req); 202 reply = ((struct aio_thread_reply) { .data = req.aio, 203 .err = err }); 204 err = write(req.aio->reply_fd, &reply, sizeof(reply)); 205 if(err != sizeof(reply)) 206 printk("not_aio_thread - write failed, fd = %d, " 207 "err = %d\n", req.aio->reply_fd, errno); 208 } 209 210 return 0; 211} 212 213static int init_aio_24(void) 214{ 215 unsigned long stack; 216 int fds[2], err; 217 218 err = os_pipe(fds, 1, 1); 219 if(err) 220 goto out; 221 222 aio_req_fd_w = fds[0]; 223 aio_req_fd_r = fds[1]; 224 225 err = os_set_fd_block(aio_req_fd_w, 0); 226 if(err) 227 goto out_close_pipe; 228 229 err = run_helper_thread(not_aio_thread, NULL, 230 CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0); 231 if(err < 0) 232 goto out_close_pipe; 233 234 aio_pid = err; 235 goto out; 236 237out_close_pipe: 238 os_close_file(fds[0]); 239 os_close_file(fds[1]); 240 aio_req_fd_w = -1; 241 aio_req_fd_r = -1; 242out: 243#ifndef HAVE_AIO_ABI 244 printk("/usr/include/linux/aio_abi.h not present during build\n"); 245#endif 246 printk("2.6 host AIO support not used - falling back to I/O " 247 "thread\n"); 248 return 0; 249} 250 251#ifdef HAVE_AIO_ABI 252#define DEFAULT_24_AIO 0 253static int init_aio_26(void) 254{ 255 unsigned long stack; 256 int err; 257 258 if(io_setup(256, &ctx)){ 259 err = -errno; 260 printk("aio_thread failed to initialize context, err = %d\n", 261 errno); 262 return err; 263 } 264 265 err = run_helper_thread(aio_thread, NULL, 266 CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0); 267 if(err < 0) 268 return err; 269 270 aio_pid = err; 271 272 printk("Using 2.6 host AIO\n"); 273 return 0; 274} 275 276static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, 277 unsigned long long offset, struct aio_context *aio) 278{ 279 struct aio_thread_reply reply; 280 int err; 281 282 err = do_aio(ctx, type, io_fd, buf, len, offset, aio); 283 if(err){ 284 reply = ((struct aio_thread_reply) { .data = aio, 285 .err = err }); 286 err = write(aio->reply_fd, &reply, sizeof(reply)); 287 if(err != sizeof(reply)){ 288 err = -errno; 289 printk("submit_aio_26 - write failed, " 290 "fd = %d, err = %d\n", aio->reply_fd, -err); 291 } 292 else err = 0; 293 } 294 295 return err; 296} 297 298#else 299#define DEFAULT_24_AIO 1 300static int init_aio_26(void) 301{ 302 return -ENOSYS; 303} 304 305static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, 306 unsigned long long offset, struct aio_context *aio) 307{ 308 return -ENOSYS; 309} 310#endif 311 312/* Initialized in an initcall and unchanged thereafter */ 313static int aio_24 = DEFAULT_24_AIO; 314 315static int __init set_aio_24(char *name, int *add) 316{ 317 aio_24 = 1; 318 return 0; 319} 320 321__uml_setup("aio=2.4", set_aio_24, 322"aio=2.4\n" 323" This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n" 324" available. 2.4 AIO is a single thread that handles one request at a\n" 325" time, synchronously. 2.6 AIO is a thread which uses the 2.6 AIO \n" 326" interface to handle an arbitrary number of pending requests. 2.6 AIO \n" 327" is not available in tt mode, on 2.4 hosts, or when UML is built with\n" 328" /usr/include/linux/aio_abi.h not available. Many distributions don't\n" 329" include aio_abi.h, so you will need to copy it from a kernel tree to\n" 330" your /usr/include/linux in order to build an AIO-capable UML\n\n" 331); 332 333static int init_aio(void) 334{ 335 int err; 336 337 CHOOSE_MODE(({ if(!aio_24){ 338 printk("Disabling 2.6 AIO in tt mode\n"); 339 aio_24 = 1; 340 } }), (void) 0); 341 342 if(!aio_24){ 343 err = init_aio_26(); 344 if(err && (errno == ENOSYS)){ 345 printk("2.6 AIO not supported on the host - " 346 "reverting to 2.4 AIO\n"); 347 aio_24 = 1; 348 } 349 else return err; 350 } 351 352 if(aio_24) 353 return init_aio_24(); 354 355 return 0; 356} 357 358/* The reason for the __initcall/__uml_exitcall asymmetry is that init_aio 359 * needs to be called when the kernel is running because it calls run_helper, 360 * which needs get_free_page. exit_aio is a __uml_exitcall because the generic 361 * kernel does not run __exitcalls on shutdown, and can't because many of them 362 * break when called outside of module unloading. 363 */ 364__initcall(init_aio); 365 366static void exit_aio(void) 367{ 368 if(aio_pid != -1) 369 os_kill_process(aio_pid, 1); 370} 371 372__uml_exitcall(exit_aio); 373 374static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len, 375 unsigned long long offset, struct aio_context *aio) 376{ 377 struct aio_thread_req req = { .type = type, 378 .io_fd = io_fd, 379 .offset = offset, 380 .buf = buf, 381 .len = len, 382 .aio = aio, 383 }; 384 int err; 385 386 err = write(aio_req_fd_w, &req, sizeof(req)); 387 if(err == sizeof(req)) 388 err = 0; 389 else err = -errno; 390 391 return err; 392} 393 394int submit_aio(enum aio_type type, int io_fd, char *buf, int len, 395 unsigned long long offset, int reply_fd, 396 struct aio_context *aio) 397{ 398 aio->reply_fd = reply_fd; 399 if(aio_24) 400 return submit_aio_24(type, io_fd, buf, len, offset, aio); 401 else { 402 return submit_aio_26(type, io_fd, buf, len, offset, aio); 403 } 404} 405