/* SPDX-License-Identifier: MIT */

#include <errno.h>
#include <linux/io_uring.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

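/* Pointer bundles into the kernel-shared SQ/CQ ring mappings. */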
struct io_sq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	unsigned int *flags;
	unsigned int *array;
};

struct io_cq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	struct io_uring_cqe *cqes;
};

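/*
 * Userspace views of the rings: the k-prefixed pointers point into the
 * kernel-shared mappings, while sqe_head/sqe_tail track SQEs staged
 * locally but not yet published via io_uring_submit().
 */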
struct io_uring_sq {
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *kflags;
	unsigned int *kdropped;
	unsigned int *array;
	struct io_uring_sqe *sqes;

	unsigned int sqe_head;
	unsigned int sqe_tail;

	size_t ring_sz;
};

struct io_uring_cq {
	unsigned int *khead;
	unsigned int *ktail;
	unsigned int *kring_mask;
	unsigned int *kring_entries;
	unsigned int *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;
};

struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};

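/*
 * On x86/x86-64 the hardware memory model is strong enough (TSO) that a
 * compiler barrier suffices for the single-producer/single-consumer ring
 * updates below; other architectures fall back to a full memory fence.
 */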
#if defined(__x86_64__) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else
#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif

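/*
 * Map the three shared regions: the SQ ring (head/tail/mask/flags plus the
 * SQE index array), the SQE array itself, and the CQ ring (including the
 * CQEs), at the offsets the kernel reported in io_uring_params.
 */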
static inline int io_uring_mmap(int fd, struct io_uring_params *p,
				struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	/* SQ ring: control words followed by the SQE index array. */
	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned int);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	/* The SQEs live in their own mapping. */
	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	/* CQ ring: control words followed by the CQE array. */
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}

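/*
 * Thin wrapper over the io_uring_setup(2) syscall: the kernel fills *p with
 * the ring sizes and mmap offsets consumed by io_uring_mmap() above.
 */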
static inline int io_uring_setup(unsigned int entries,
				 struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

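/*
 * Wrapper over io_uring_enter(2). The final argument is the size of the
 * signal mask in bytes (_NSIG / 8), which the kernel expects whenever a
 * sigset is passed.
 */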
static inline int io_uring_enter(int fd, unsigned int to_submit,
				 unsigned int min_complete,
				 unsigned int flags, sigset_t *sig)
{
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		       flags, sig, _NSIG / 8);
}

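/*
 * Create a ring with the given number of entries and map it into this
 * process. Returns 0 on success or -errno on failure.
 */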
static inline int io_uring_queue_init(unsigned int entries,
				      struct io_uring *ring,
				      unsigned int flags)
{
	struct io_uring_params p;
	int fd, ret;

	memset(ring, 0, sizeof(*ring));
	memset(&p, 0, sizeof(p));
	p.flags = flags;

	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return -errno;
	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
	if (!ret)
		ring->ring_fd = fd;
	else
		close(fd);
	return ret;
}

/*
 * Get the next free SQE, or NULL if the ring is full. The entry is not
 * visible to the kernel until io_uring_submit() publishes it.
 */
static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;

	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
		return NULL;
	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
}

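/*
 * Wait for at least one completion: return the CQE at the current head if
 * one is already posted, otherwise block in io_uring_enter() with
 * IORING_ENTER_GETEVENTS. The head is not advanced; pair each successful
 * call with io_uring_cqe_seen().
 */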
static inline int io_uring_wait_cqe(struct io_uring *ring,
				    struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned int mask = *cq->kring_mask;
	unsigned int head = *cq->khead;
	int ret;

	*cqe_ptr = NULL;
	do {
		read_barrier();
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			break;
		}
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	} while (1);

	return 0;
}

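/*
 * Publish locally staged SQEs to the kernel and submit them. If the
 * kernel-visible ring still holds unconsumed entries, re-enter the kernel
 * without staging anything new; otherwise fill the index array, publish
 * the new tail, and call io_uring_enter(). Returns the number of SQEs
 * consumed by the kernel, or -errno on failure.
 */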
static inline int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned int mask = *sq->kring_mask;
	unsigned int ktail, submitted, to_submit;
	int ret;

	read_barrier();
	if (*sq->khead != *sq->ktail) {
		/* Entries already published but not yet consumed. */
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	/* Fill the index array with the locally staged SQEs. */
	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	if (*sq->ktail != ktail) {
		/* Order the array writes before the tail update. */
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}

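/*
 * Tear down the ring. Note that only the SQE array and the SQ ring mapping
 * are unmapped here; the separate CQ ring mapping is left for process exit
 * to reclaim.
 */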
static inline void io_uring_queue_exit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;

	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
	munmap(sq->khead, sq->ring_sz);
	close(ring->ring_fd);
}

/*
 * Prepare (but do not submit) an IORING_OP_URING_CMD SQE carrying a
 * socket-level command: level/optname/optval/optlen are passed through
 * to the target socket.
 */
static inline void io_uring_prep_cmd(struct io_uring_sqe *sqe, int op,
				     int sockfd,
				     int level, int optname,
				     const void *optval,
				     int optlen)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = (__u8)IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = op;

	sqe->level = level;
	sqe->optname = optname;
	/* Go via unsigned long to avoid a pointer/integer size mismatch
	 * warning on 32-bit builds.
	 */
	sqe->optval = (unsigned long long)(unsigned long)optval;
	sqe->optlen = optlen;
}

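/*
 * Register fixed buffers with the ring via io_uring_register(2). Returns 0
 * on success or -errno on failure.
 */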
static inline int io_uring_register_buffers(struct io_uring *ring,
					    const struct iovec *iovecs,
					    unsigned int nr_iovecs)
{
	int ret;

	ret = syscall(__NR_io_uring_register, ring->ring_fd,
		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
	return (ret < 0) ? -errno : ret;
}

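/* Prepare a plain IORING_OP_SEND, mirroring send(2) semantics. */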
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = (__u8)IORING_OP_SEND;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long)buf;
	sqe->len = len;
	sqe->msg_flags = (__u32)flags;
}

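/*
 * Prepare a zero-copy send (IORING_OP_SEND_ZC). For this opcode, zero-copy
 * flags such as IORING_RECVSEND_FIXED_BUF travel in sqe->ioprio.
 */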
static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
					const void *buf, size_t len, int flags,
					unsigned int zc_flags)
{
	io_uring_prep_send(sqe, sockfd, buf, len, flags);
	sqe->opcode = (__u8)IORING_OP_SEND_ZC;
	sqe->ioprio = zc_flags;
}

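/* Consume one CQE: advance the CQ head and publish the update. */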
static inline void io_uring_cqe_seen(struct io_uring *ring)
{
	*ring->cq.khead += 1;
	write_barrier();
}

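/*
 * Minimal usage sketch (illustrative only; error handling is trimmed, and
 * the connected socket sockfd, buffer buf, and length len are assumed to
 * exist in the caller). cqe->res holds the send(2)-style result, i.e.
 * bytes sent or a negative errno:
 *
 *	struct io_uring ring;
 *	struct io_uring_sqe *sqe;
 *	struct io_uring_cqe *cqe;
 *
 *	if (io_uring_queue_init(8, &ring, 0))
 *		return;
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_send(sqe, sockfd, buf, len, 0);
 *	io_uring_submit(&ring);
 *	io_uring_wait_cqe(&ring, &cqe);
 *	io_uring_cqe_seen(&ring);
 *	io_uring_queue_exit(&ring);
 */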