/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************
 *
 * This implementation is based on a design by John Brooks (IBM Pok) which uses
 * the z/OS sockets async i/o facility.  When a
 * socket is added to the pollset, an async poll is issued for that individual
 * socket.  It specifies that the kernel should send an IPC message when the
 * socket becomes ready.  The IPC messages are sent to a single message queue
 * that is part of the pollset.  apr_pollset_poll waits on the arrival of IPC
 * messages or the specified timeout.
 *
 * Since z/OS does not support async i/o for pipes or files at present, this
 * implementation falls back to using ordinary poll() when
 * APR_POLLSET_THREADSAFE is unset.
 *
 * Greg Ames
 * April 2012
 */

#include "apr.h"
#include "apr_hash.h"
#include "apr_poll.h"
#include "apr_time.h"
#include "apr_portable.h"
#include "apr_arch_inherit.h"
#include "apr_arch_file_io.h"
#include "apr_arch_networkio.h"
#include "apr_arch_poll_private.h"

#ifdef HAVE_AIO_MSGQ

#include <sys/msg.h>    /* msgget etc   */
#include <time.h>       /* timestruct   */
#include <poll.h>       /* pollfd       */
#include <limits.h>     /* INT_MAX      */

/* Per-pollset private state.  Which members are used depends on whether the
 * pollset was created with APR_POLLSET_THREADSAFE (async i/o mode) or not
 * (plain poll() mode).
 */
struct apr_pollset_private_t
{
    int             msg_q;      /* IPC message queue. The z/OS kernel sends
                                 * messages to this queue when our async polls
                                 * on individual file descriptors complete
                                 */
    apr_pollfd_t    *result_set; /* array of 'size' entries handed to callers */
    apr_uint32_t    size;

#if APR_HAS_THREADS
    /* A thread mutex to protect operations on the rings and the hash */
    apr_thread_mutex_t *ring_lock;
#endif

    /* A hash of all active elements used for O(1) _remove operations */
    apr_hash_t      *elems;

    APR_RING_HEAD(ready_ring_t,       asio_elem_t)  ready_ring;
    APR_RING_HEAD(prior_ready_ring_t, asio_elem_t)  prior_ready_ring;
    APR_RING_HEAD(free_ring_t,        asio_elem_t)  free_ring;

    /* for pipes etc with no asio */
    struct pollfd   *pollset;
    apr_pollfd_t    *query_set;
};

/* Life cycle of one async poll element */
typedef enum {
    ASIO_INIT = 0,      /* async poll issued, completion not yet seen      */
    ASIO_REMOVED,       /* caller removed the descriptor from the pollset  */
    ASIO_COMPLETE       /* completion seen, element is on a ready ring     */
} asio_state_e;

typedef struct asio_elem_t asio_elem_t;

/* Layout of the IPC message posted when an async poll completes */
struct asio_msgbuf_t {
    long         msg_type;       /* must be > 0 */
    asio_elem_t *msg_elem;
};

/* One descriptor that was added to an async (THREADSAFE) pollset */
struct asio_elem_t {
    APR_RING_ENTRY(asio_elem_t) link;
    apr_pollfd_t                pfd;     /* caller's descriptor, by value   */
    struct pollfd               os_pfd;  /* what the async poll operates on */
    struct aiocb                a;
    asio_state_e                state;
    struct asio_msgbuf_t        msg;
};

#define DEBUG 0

/* DEBUG settings: 0 - no debug messages at all,
 *                 1 - should not occur messages,
 *                 2 - apr_pollset_* entry and exit messages,
 *                 3 - state changes, memory usage,
 *                 4 - z/OS, APR, and internal calls,
 *                 5 - everything else except the timer pop path,
 *                 6 - everything, including the Event 1 sec timer pop path
 *
 *  each DEBUG level includes all messages produced by lower numbered levels
 */

#if DEBUG

#include <assert.h>
#include <unistd.h>     /* getpid       */

#define DBG_BUFF char dbg_msg_buff[256];

#define DBG_TEST(lvl) if (lvl <= DEBUG) {

#define DBG_CORE(msg)               sprintf(dbg_msg_buff, "% 8d " __FUNCTION__ \
                                        " "  msg, getpid()),                   \
                                    fprintf(stderr, "%s", dbg_msg_buff);
#define DBG_CORE1(msg, var1)        sprintf(dbg_msg_buff, "% 8d " __FUNCTION__ \
                                        " " msg, getpid(), var1),              \
                                    fprintf(stderr, "%s", dbg_msg_buff);
#define DBG_CORE2(msg, var1, var2)  sprintf(dbg_msg_buff, "% 8d " __FUNCTION__ \
                                        " " msg, getpid(), var1, var2),        \
                                    fprintf(stderr, "%s", dbg_msg_buff);
#define DBG_CORE3(msg, var1, var2, var3)                                       \
                                    sprintf(dbg_msg_buff, "% 8d " __FUNCTION__ \
                                        " " msg, getpid(), var1, var2, var3),  \
                                    fprintf(stderr, "%s", dbg_msg_buff);
#define DBG_CORE4(msg, var1, var2, var3, var4)                                 \
                                    sprintf(dbg_msg_buff, "% 8d " __FUNCTION__ \
                                        " " msg, getpid(), var1, var2, var3, var4),\
                                    fprintf(stderr, "%s", dbg_msg_buff);

#define DBG_END }

#define DBG(lvl, msg)   DBG_TEST(lvl)   \
                        DBG_CORE(msg)   \
                        DBG_END

#define DBG1(lvl, msg, var1)    DBG_TEST(lvl)           \
                                DBG_CORE1(msg, var1)    \
                                DBG_END

#define DBG2(lvl, msg, var1, var2)      DBG_TEST(lvl)               \
                                        DBG_CORE2(msg, var1, var2)  \
                                        DBG_END

#define DBG3(lvl, msg, var1, var2, var3)                        \
                        DBG_TEST(lvl)                           \
                        DBG_CORE3(msg, var1, var2, var3)        \
                        DBG_END

#define DBG4(lvl, msg, var1, var2, var3, var4)                  \
                        DBG_TEST(lvl)                           \
                        DBG_CORE4(msg, var1, var2, var3, var4)  \
                        DBG_END

#else  /* DEBUG is 0 */
#define DBG_BUFF
#define DBG(lvl, msg)                            ((void)0)
#define DBG1(lvl, msg, var1)                     ((void)0)
#define DBG2(lvl, msg, var1, var2)               ((void)0)
#define DBG3(lvl, msg, var1, var2, var3)         ((void)0)
#define DBG4(lvl, msg, var1, var2, var3, var4)   ((void)0)

#endif /* DEBUG */

/* Select the 64-bit or 31-bit flavour of the async i/o callable service */
#ifdef _LP64
#define AIO BPX4AIO
#else
#define AIO BPX1AIO
#endif

/* Issue the async i/o request described by *a.
 *
 * Returns the value the service stored in rv; errno and the word at
 * __err2ad() are handed to the service to fill in.  Callers in this file
 * treat 0 as "request accepted", 1 as "completed inline" (or, for
 * AIO_CANCEL, the value asserted in debug builds) and anything else as a
 * failure described by errno.
 */
static int asyncio(struct aiocb *a)
{
    DBG_BUFF
    int rv;

    AIO(sizeof(struct aiocb), a, &rv, &errno, __err2ad());
    DBG2(4, "BPX4AIO aiocb %p rv %d\n",
             a, rv);
#if DEBUG   /* was #ifdef, which is always true because DEBUG is defined as 0 */
    if (rv < 0) {
        DBG2(4, "errno %d errnojr %08x\n",
                 errno, *__err2ad());
    }
#endif
    return rv;
}

/* Translate APR_POLL* request bits into the matching poll(2) POLL* bits */
static apr_int16_t get_event(apr_int16_t event)
{
    DBG_BUFF
    apr_int16_t rv = 0;
    DBG(4, "entered\n");

    if (event & APR_POLLIN)
        rv |= POLLIN;
    if (event & APR_POLLPRI)
        rv |= POLLPRI;
    if (event & APR_POLLOUT)
        rv |= POLLOUT;
    if (event & APR_POLLERR)
        rv |= POLLERR;
    if (event & APR_POLLHUP)
        rv |= POLLHUP;
    if (event & APR_POLLNVAL)
        rv |= POLLNVAL;

    DBG(4, "exiting\n");
    return rv;
}

/* Translate poll(2) POLL* result bits back into APR_POLL* bits */
static apr_int16_t get_revent(apr_int16_t event)
{
    DBG_BUFF
    apr_int16_t rv = 0;
    DBG(4, "entered\n");

    if (event & POLLIN)
        rv |= APR_POLLIN;
    if (event & POLLPRI)
        rv |= APR_POLLPRI;
    if (event & POLLOUT)
        rv |= APR_POLLOUT;
    if (event & POLLERR)
        rv |= APR_POLLERR;
    if (event & POLLHUP)
        rv |= APR_POLLHUP;
    if (event & POLLNVAL)
        rv |= APR_POLLNVAL;

    DBG(4, "exiting\n");
    return rv;
}

/* Release OS resources held by the pollset.
 *
 * Only an async (THREADSAFE) pollset owns a message queue; it is removed
 * here.  Returns APR_SUCCESS, or errno if msgctl() fails.
 */
static apr_status_t asio_pollset_cleanup(apr_pollset_t *pollset)
{
    DBG_BUFF
    apr_status_t rv = APR_SUCCESS;  /* nothing to do for a poll() pollset */
    int rc;

    DBG(4, "entered\n");
    if (pollset->flags & APR_POLLSET_THREADSAFE) {
        rc = msgctl(pollset->p->msg_q, IPC_RMID, NULL);
        if (rc < 0) {
            rv = errno;     /* grab it before any debug output can change it */
        }
        DBG1(4, "asio_pollset_cleanup: msgctl(IPC_RMID) returned %d\n", rc);
    }

    return rv;
}

/* Initialise the private part of a new pollset.
 *
 * pollset  pollset being created; pollset->p is set on success and reset to
 *          NULL on every failure path
 * size     capacity of the result set (and of the poll() arrays when
 *          APR_POLLSET_THREADSAFE is unset)
 * p        pool all allocations are made from
 * flags    APR_POLLSET_* flags; APR_POLLSET_THREADSAFE selects async i/o
 *
 * Returns APR_SUCCESS, the apr_thread_mutex_create() status, errno from
 * msgget(), APR_ENOTIMPL (THREADSAFE requested without thread support) or
 * APR_ENOMEM.
 */
static apr_status_t asio_pollset_create(apr_pollset_t *pollset,
                                        apr_uint32_t size,
                                        apr_pool_t *p,
                                        apr_uint32_t flags)
{
    DBG_BUFF
    apr_status_t rv;
    apr_pollset_private_t *priv;

    DBG1(2, "entered, flags: %x\n", flags);

    priv = pollset->p = apr_pcalloc(p, sizeof(*priv));

    if (flags & APR_POLLSET_THREADSAFE) {
#if APR_HAS_THREADS
        int qid;

        if ((rv = apr_thread_mutex_create(&(priv->ring_lock),
                                          APR_THREAD_MUTEX_DEFAULT,
                                          p)) != APR_SUCCESS) {
            DBG1(1, "apr_thread_mutex_create returned %d\n", rv);
            pollset->p = NULL;
            return rv;
        }

        qid = msgget(IPC_PRIVATE, S_IWUSR+S_IRUSR); /* user r/w perms */
        if (qid < 0) {
            /* report the real cause rather than msgget's -1 */
            rv = errno;
#if DEBUG
            perror(__FUNCTION__ " msgget returned < 0 ");
#endif
            pollset->p = NULL;
            return rv;
        }

        DBG2(4, "pollset %p msgget was OK, rv=%d\n", pollset, qid);
        priv->msg_q = qid;
        priv->elems = apr_hash_make(p);

        APR_RING_INIT(&priv->free_ring, asio_elem_t, link);
        APR_RING_INIT(&priv->prior_ready_ring, asio_elem_t, link);

#else  /* APR doesn't have threads but caller wants a threadsafe pollset */
        pollset->p = NULL;
        return APR_ENOTIMPL;
#endif

    } else {  /* APR_POLLSET_THREADSAFE not set, i.e. no async i/o,
               * init fields only needed in old style pollset
               */

        priv->pollset = apr_palloc(p, size * sizeof(struct pollfd));
        priv->query_set = apr_palloc(p, size * sizeof(apr_pollfd_t));

        if ((!priv->pollset) || (!priv->query_set)) {
            pollset->p = NULL;
            return APR_ENOMEM;
        }
    }

    pollset->nelts   = 0;
    pollset->flags   = flags;
    pollset->pool    = p;
    priv->size       = size;
    priv->result_set = apr_palloc(p, size * sizeof(apr_pollfd_t));
    if (!priv->result_set) {
        if (flags & APR_POLLSET_THREADSAFE) {
            msgctl(priv->msg_q, IPC_RMID, NULL);
        }
        pollset->p = NULL;
        return APR_ENOMEM;
    }

    DBG2(2, "exiting, pollset: %p, type: %s\n",
             pollset,
             flags & APR_POLLSET_THREADSAFE ? "async" : "POSIX");

    return APR_SUCCESS;

} /* end of asio_pollset_create */

/* Append a descriptor to the poll() arrays of a non-async pollset.
 * Returns APR_ENOMEM when the arrays are already full.
 */
static apr_status_t posix_add(apr_pollset_t      *pollset,
                              const apr_pollfd_t *descriptor)
{
    DBG_BUFF
    int fd;
    apr_pollset_private_t *priv = pollset->p;
    DBG(4, "entered\n");

    if (pollset->nelts == priv->size) {
        return APR_ENOMEM;
    }

    priv->query_set[pollset->nelts] = *descriptor;
    if (descriptor->desc_type == APR_POLL_SOCKET) {
        fd = descriptor->desc.s->socketdes;
    }
    else {
        fd = descriptor->desc.f->filedes;
    }

    priv->pollset[pollset->nelts].fd = fd;

    priv->pollset[pollset->nelts].events =
        get_event(descriptor->reqevents);

    pollset->nelts++;

    DBG2(4, "exiting, fd %d added to pollset %p\n", fd, pollset);

    return APR_SUCCESS;
}   /* end of posix_add */

/* Add a descriptor to the pollset.
 *
 * Async mode: take an element from the free ring (or allocate one), issue an
 * AIO_SELPOLL request for the socket and record the element in the hash.
 * Otherwise the descriptor is handed to posix_add().
 *
 * Returns APR_SUCCESS, errno from the async i/o service, or the posix_add()
 * status.
 */
static apr_status_t asio_pollset_add(apr_pollset_t *pollset,
                                     const apr_pollfd_t *descriptor)
{
    DBG_BUFF
    asio_elem_t *elem;
    apr_status_t rv = APR_SUCCESS;
    apr_pollset_private_t *priv = pollset->p;
    int ret;

    pollset_lock_rings();
    DBG(2, "entered\n");

    if (pollset->flags & APR_POLLSET_THREADSAFE) {

        if (!APR_RING_EMPTY(&(priv->free_ring), asio_elem_t, link)) {
            elem = APR_RING_FIRST(&(priv->free_ring));
            APR_RING_REMOVE(elem, link);
            DBG1(3, "used recycled memory at %08p\n", elem);
            elem->state = ASIO_INIT;
            elem->a.aio_cflags = 0;
        }
        else {
            elem = (asio_elem_t *) apr_pcalloc(pollset->pool, sizeof(asio_elem_t));
            DBG1(3, "alloced new memory at %08p\n", elem);

            /* these fields never change for the life of the element */
            elem->a.aio_notifytype = AIO_MSGQ;
            elem->a.aio_msgev_qid  = priv->msg_q;
            DBG1(5, "aio_msgev_quid = %d \n", elem->a.aio_msgev_qid);
            elem->a.aio_msgev_size = sizeof(asio_elem_t *);
            elem->a.aio_msgev_flag = 0;     /* wait if queue is full */
            elem->a.aio_msgev_addr = &(elem->msg);
            elem->a.aio_buf        = &(elem->os_pfd);
            elem->a.aio_nbytes     = 1;     /* number of pfds to poll */
            elem->msg.msg_type     = 1;
            elem->msg.msg_elem     = elem;
        }

        /* z/OS only supports async I/O for sockets for now */
        elem->os_pfd.fd = descriptor->desc.s->socketdes;

        APR_RING_ELEM_INIT(elem, link);
        elem->a.aio_cmd       = AIO_SELPOLL;
        elem->a.aio_cflags    &= ~AIO_OK2COMPIMD; /* not OK to complete inline*/
        elem->pfd             = *descriptor;
        elem->os_pfd.events   = get_event(descriptor->reqevents);

        if (0 != (ret = asyncio(&elem->a))) {
            rv = errno;
            DBG3(4, "pollset %p asio failed fd %d, errno %d\n",
                     pollset, elem->os_pfd.fd, rv);
#if DEBUG
            perror(__FUNCTION__ " asio failure");
#endif
            if (ret < 0) {
                /* The request was rejected, so presumably no completion
                 * message will ever reference this element; recycle it
                 * instead of leaking it.
                 */
                APR_RING_INSERT_TAIL(&(priv->free_ring), elem,
                                     asio_elem_t, link);
            }
        }
        else {
            DBG2(4, "good asio call, adding fd %d to pollset %p\n",
                     elem->os_pfd.fd, pollset);

            pollset->nelts++;
            /* key storage must outlive the call, hence the elem's own fd */
            apr_hash_set(priv->elems, &(elem->os_pfd.fd), sizeof(int), elem);
        }
    }
    else {
        /* APR_POLLSET_THREADSAFE isn't set.  use POSIX poll in case
         * pipes or files are used with this pollset
         */

        rv = posix_add(pollset, descriptor);
    }

    DBG1(2, "exiting, rv = %d\n", rv);

    pollset_unlock_rings();
    return rv;
} /* end of asio_pollset_add */

/* Remove every entry matching the descriptor from the poll() arrays,
 * compacting them in place.  Returns APR_NOTFOUND if there was no match.
 */
static apr_status_t posix_remove(apr_pollset_t *pollset,
                                 const apr_pollfd_t *descriptor)
{
    DBG_BUFF
    apr_uint32_t i;
    apr_pollset_private_t *priv = pollset->p;

    DBG(4, "entered\n");
    for (i = 0; i < pollset->nelts; i++) {
        if (descriptor->desc.s == priv->query_set[i].desc.s) {
            /* Found an instance of the fd: remove this and any other copies */
            apr_uint32_t dst = i;
            apr_uint32_t old_nelts = pollset->nelts;
            pollset->nelts--;
            for (i++; i < old_nelts; i++) {
                if (descriptor->desc.s == priv->query_set[i].desc.s) {
                    pollset->nelts--;
                }
                else {
                    priv->pollset[dst] = priv->pollset[i];
                    priv->query_set[dst] = priv->query_set[i];
                    dst++;
                }
            }
            DBG(4, "returning OK\n");
            return APR_SUCCESS;
        }
    }

    DBG(1, "returning APR_NOTFOUND\n");
    return APR_NOTFOUND;

}   /* end of posix_remove */

/* Remove a descriptor from the pollset.
 *
 * Async mode: drop the element from the hash, cancel the outstanding async
 * poll if one is pending and mark the element ASIO_REMOVED.  Its memory is
 * recycled later, once it shows up on the message queue or on the
 * prior-ready ring.  Otherwise the work is done by posix_remove().
 *
 * Returns APR_SUCCESS or APR_NOTFOUND.
 */
static apr_status_t asio_pollset_remove(apr_pollset_t *pollset,
                                        const apr_pollfd_t *descriptor)
{
    DBG_BUFF
    asio_elem_t *elem;
    apr_status_t rv = APR_SUCCESS;
    apr_pollset_private_t *priv = pollset->p;
    /* AIO_CANCEL is synchronous, so autodata works fine.  */
    struct aiocb cancel_a = {0};

    int fd;

    DBG(2, "entered\n");

    if (!(pollset->flags & APR_POLLSET_THREADSAFE)) {
        return posix_remove(pollset, descriptor);
    }

    pollset_lock_rings();

#if DEBUG
    assert(descriptor->desc_type == APR_POLL_SOCKET);
#endif
    /* zOS 1.12 doesn't support files for async i/o */
    fd = descriptor->desc.s->socketdes;

    elem = apr_hash_get(priv->elems, &(fd), sizeof(int));
    if (elem == NULL) {
        DBG1(1, "couldn't find fd %d\n", fd);
        rv = APR_NOTFOUND;
    } else {
        DBG1(5, "hash found fd %d\n", fd);
        /* delete this fd from the hash */
        apr_hash_set(priv->elems, &(fd), sizeof(int), NULL);
        /* keep the element count in step with asio_pollset_add */
        pollset->nelts--;

        if (elem->state == ASIO_INIT) {
            /* asyncio call to cancel */
            cancel_a.aio_cmd = AIO_CANCEL;
            cancel_a.aio_buf = &elem->a;   /* point to original aiocb */

            cancel_a.aio_cflags  = 0;
            cancel_a.aio_cflags2 = 0;

            /* we want the original aiocb to show up on the pollset message
             * queue before recycling its memory to eliminate race conditions
             */

            rv = asyncio(&cancel_a);
            DBG1(4, "asyncio returned %d\n", rv);

#if DEBUG
            assert(rv == 1);
#endif
        }
        elem->state = ASIO_REMOVED;
        rv = APR_SUCCESS;
    }

    DBG1(2, "exiting, rv: %d\n", rv);

    pollset_unlock_rings();

    return rv;
}   /* end of asio_pollset_remove */

/* poll() based wait for a non-async pollset.
 *
 * timeout is divided by 1000 before being passed to poll() when positive;
 * zero and negative values are passed through unchanged.  *num receives
 * poll()'s return value.  Returns APR_SUCCESS, APR_TIMEUP, or the network
 * error reported by apr_get_netos_error().
 */
static apr_status_t posix_poll(apr_pollset_t *pollset,
                               apr_interval_time_t timeout,
                               apr_int32_t *num,
                               const apr_pollfd_t **descriptors)
{
    DBG_BUFF
    int rv;
    apr_uint32_t i, j;
    apr_pollset_private_t *priv = pollset->p;

    DBG(4, "entered\n");

    if (timeout > 0) {
        timeout /= 1000;
    }
    rv = poll(priv->pollset, pollset->nelts, timeout);
    (*num) = rv;
    if (rv < 0) {
        return apr_get_netos_error();
    }
    if (rv == 0) {
        return APR_TIMEUP;
    }
    j = 0;
    for (i = 0; i < pollset->nelts; i++) {
        if (priv->pollset[i].revents != 0) {
            priv->result_set[j] = priv->query_set[i];
            priv->result_set[j].rtnevents =
                get_revent(priv->pollset[i].revents);
            j++;
        }
    }
    if (descriptors)
        *descriptors = priv->result_set;

    DBG(4, "exiting ok\n");
    return APR_SUCCESS;

}   /* end of posix_poll */

/* Act on one completion message taken from the pollset's message queue.
 * A removed element is recycled onto the free ring; a pending one is marked
 * complete and queued on the ready ring.  Any other state is unexpected.
 */
static void process_msg(apr_pollset_t *pollset, struct asio_msgbuf_t *msg)
{
    DBG_BUFF
    asio_elem_t *elem = msg->msg_elem;

    switch(elem->state) {
    case ASIO_REMOVED:
        DBG2(5, "for cancelled elem, recycling memory - elem %08p, fd %d\n",
                elem, elem->os_pfd.fd);
        APR_RING_INSERT_TAIL(&(pollset->p->free_ring), elem,
                             asio_elem_t, link);
        break;
    case ASIO_INIT:
        DBG2(4, "adding to ready ring: elem %08p, fd %d\n",
                elem, elem->os_pfd.fd);
        elem->state = ASIO_COMPLETE;
        APR_RING_INSERT_TAIL(&(pollset->p->ready_ring), elem,
                             asio_elem_t, link);
        break;
    default:
        DBG3(1, "unexpected state: elem %08p, fd %d, state %d\n",
            elem, elem->os_pfd.fd, elem->state);
#if DEBUG
        assert(0);
#endif
        break;
    }
}

/* Wait for descriptors in the pollset to become ready.
 *
 * pollset      the pollset
 * timeout      in apr_interval_time_t units; negative means block until
 *              something is ready
 * num          out: number of entries stored in the result set (0 on the
 *              async timeout and error paths)
 * descriptors  out (optional): set to the pollset's result array
 *
 * Returns APR_SUCCESS, APR_TIMEUP, or an errno value from the async i/o or
 * message queue services.  A pollset without APR_POLLSET_THREADSAFE is
 * served by posix_poll().
 */
static apr_status_t asio_pollset_poll(apr_pollset_t *pollset,
                                      apr_interval_time_t timeout,
                                      apr_int32_t *num,
                                      const apr_pollfd_t **descriptors)
{
    DBG_BUFF
    apr_uint32_t i;
    int ret;
    asio_elem_t *elem;
    struct asio_msgbuf_t msg_buff;
    struct timespec tv;
    apr_status_t rv = APR_SUCCESS;
    apr_pollset_private_t *priv = pollset->p;

    DBG(6, "entered\n"); /* chatty - traces every second w/Event */

    if ((pollset->flags & APR_POLLSET_THREADSAFE) == 0 ) {
        return posix_poll(pollset, timeout, num, descriptors);
    }

    (*num) = 0;     /* defined result on the timeout and error returns too */

    pollset_lock_rings();
    APR_RING_INIT(&(priv->ready_ring), asio_elem_t, link);

    while (!APR_RING_EMPTY(&(priv->prior_ready_ring), asio_elem_t, link)) {
        elem = APR_RING_FIRST(&(priv->prior_ready_ring));
        DBG3(5, "pollset %p elem %p fd %d on prior ready ring\n",
                pollset,
                elem,
                elem->os_pfd.fd);

        APR_RING_REMOVE(elem, link);

        /*
         * since USS does not remember what's in our pollset, we have
         * to re-add fds which have not been apr_pollset_remove'd
         *
         * there may have been too many ready fd's to return in the
         * result set last time. re-poll inline for both cases
         */

        if (elem->state == ASIO_REMOVED) {

            /*
             * async i/o is done since it was found on prior_ready
             * the state says the caller is done with it too
             * so recycle the elem
             */

            APR_RING_INSERT_TAIL(&(priv->free_ring), elem,
                                 asio_elem_t, link);
            continue;  /* do not re-add if it has been _removed */
        }

        elem->state = ASIO_INIT;
        elem->a.aio_cflags     = AIO_OK2COMPIMD;

        if (0 != (ret = asyncio(&elem->a))) {
            if (ret == 1) {
                DBG(4, "asyncio() completed inline\n");
                /* it's ready now */
                elem->state = ASIO_COMPLETE;
                APR_RING_INSERT_TAIL(&(priv->ready_ring), elem, asio_elem_t,
                                     link);
            }
            else {
                rv = errno;     /* save before anything else can change it */
                DBG2(1, "asyncio() failed, ret: %d, errno: %d\n",
                        ret, rv);
                /* don't lose elements that already completed inline;
                 * they will be looked at again on the next call
                 */
                APR_RING_CONCAT(&priv->prior_ready_ring, &(priv->ready_ring),
                                asio_elem_t, link);
                pollset_unlock_rings();
                return rv;
            }
        }
        DBG1(4, "asyncio() completed rc %d\n", ret);
    }

    DBG(6, "after prior ready loop\n"); /* chatty w/timeouts, hence 6 */

    /* Gather async poll completions that have occurred since the last call */
    while (0 < msgrcv(priv->msg_q, &msg_buff, sizeof(asio_elem_t *), 0,
                      IPC_NOWAIT)) {
        process_msg(pollset, &msg_buff);
    }

    /* Suspend if nothing is ready yet. */
    if (APR_RING_EMPTY(&(priv->ready_ring), asio_elem_t, link)) {

        if (timeout >= 0) {
            tv.tv_sec  = apr_time_sec(timeout);
            tv.tv_nsec = apr_time_usec(timeout) * 1000;
        } else {
            tv.tv_sec  = INT_MAX;  /* block until something is ready */
            tv.tv_nsec = 0;        /* must not be left as stack garbage */
        }

        DBG2(6, "nothing on the ready ring "
                "- blocking for %d seconds %d ns\n",
                tv.tv_sec, tv.tv_nsec);

        pollset_unlock_rings();   /* allow other apr_pollset_* calls while blocked */

        if (0 >= (ret = __msgrcv_timed(priv->msg_q, &msg_buff,
                                       sizeof(asio_elem_t *), 0, 0, &tv))) {
            int err = errno;    /* save before debug output can change it */
#if DEBUG
            if (err == EAGAIN) {
                DBG(6, "__msgrcv_timed timed out\n"); /* timeout path, so 6 */
            }
            else {
                DBG(1, "__msgrcv_timed failed!\n");
            }
#endif
            /* the ring lock is not held here, so nothing to release */
            return (err == EAGAIN) ? APR_TIMEUP : err;
        }

        pollset_lock_rings();

        process_msg(pollset, &msg_buff);
    }

    APR_RING_INIT(&priv->prior_ready_ring, asio_elem_t, link);

    elem = APR_RING_FIRST(&(priv->ready_ring));

    for (i = 0;
         i < priv->size
             && elem != APR_RING_SENTINEL(&(priv->ready_ring), asio_elem_t, link);
         i++) {
        DBG2(5, "ready ring: elem %08p, fd %d\n", elem, elem->os_pfd.fd);

        priv->result_set[i] = elem->pfd;
        priv->result_set[i].rtnevents
                            = get_revent(elem->os_pfd.revents);
        (*num)++;

        elem = APR_RING_NEXT(elem, link);

#if DEBUG
        if (elem == APR_RING_SENTINEL(&(priv->ready_ring), asio_elem_t, link)) {
            DBG(5, "end of ready ring reached\n");
        }
#endif
    }

    if (descriptors) {
        *descriptors = priv->result_set;
    }

    /* if the result size is too small, remember which descriptors
     * haven't had results reported yet.  we will look
     * at these descriptors on the next apr_pollset_poll call
     */

    APR_RING_CONCAT(&priv->prior_ready_ring, &(priv->ready_ring), asio_elem_t, link);

    DBG1(2, "exiting, rv = %d\n", rv);

    pollset_unlock_rings();

    return rv;
}  /* end of asio_pollset_poll */

/* Provider table tying the functions above together under the name "asio" */
static const apr_pollset_provider_t impl = {
    asio_pollset_create,
    asio_pollset_add,
    asio_pollset_remove,
    asio_pollset_poll,
    asio_pollset_cleanup,
    "asio"
};

const apr_pollset_provider_t *apr_pollset_provider_aio_msgq = &impl;

#endif /* HAVE_AIO_MSGQ */