/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int	tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_read =	tcp_log_dev_read,
	.d_open =	tcp_log_dev_open,
	.d_write =	tcp_log_dev_write,
	.d_poll =	tcp_log_dev_poll,
	.d_ioctl =	tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap =	tcp_log_dev_mmap,
#endif
	.d_name =	"tcp_log",
};

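/*
 * Verify (under INVARIANTS) that the queue lock state matches what the
 * caller believes it to be.
 */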
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Drop a reference on a queue entry. If that releases the last reference,
 * remove the entry from the queue and call its destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

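/*
 * Destructor for a reader's per-open (cdevpriv) state; devfs calls this when
 * the last reference to the open file goes away. Drop the references this
 * reader holds on queue entries, remove it from the reader list, decrement
 * the listener count, and free the private data.
 */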
static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

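/*
 * Open handler. The device is read-only, so reject opens for writing,
 * appending, truncation, or execution. Otherwise, allocate per-open state,
 * register it with devfs_set_cdevpriv(), add this reader to the reader list,
 * and take a reference on every entry currently queued so none of them is
 * freed before this reader has seen it.
 */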
static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list, and
		 * take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

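/* The device is read-only; writes always fail. */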
static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

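/*
 * Advance a reader past the buffer it has finished consuming: drop its
 * reference on the current head entry, move tldi_head to the next entry in
 * the queue, and clear tldi_cur. Takes the queue lock if the caller does not
 * already hold it, and records that fact in *lockstate.
 */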
static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

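/*
 * Read handler. Find the reader's current buffer, materializing it via the
 * entry's tldq_xform callback if it has not been built yet, copy out as much
 * as fits in the uio, and rotate to the next entry once a buffer has been
 * fully consumed. If no data is available, return EAGAIN for non-blocking
 * opens or sleep on tcp_log_dev_listeners until tcp_log_dev_add_log() queues
 * a new entry.
 */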
static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

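/*
 * ioctl handler. Only the generic non-blocking and async I/O controls are
 * recognized: FIONBIO is a no-op (non-blocking reads are already supported)
 * and FIOASYNC is accepted only when async I/O is being cleared. Anything
 * else is rejected with ENOIOCTL.
 */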
static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

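/*
 * Poll handler. Report the device readable when the reader is partway
 * through a buffer or has an unread entry waiting at tldi_head; otherwise
 * record the thread in tcp_log_sel so tcp_log_dev_add_log() can wake it.
 */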
static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we need to hold
		 * the queue lock while we check.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}

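/*
 * Hand a completed log entry to the device so readers can consume it. The
 * caller must have set tldq_dtor and at least one of tldq_buf or tldq_xform.
 * On success (0), the queue takes over the entry: its refcount is initialized
 * to the current listener count, waiting readers are pointed at it and woken,
 * and it is eventually freed through tldq_dtor. If there are no listeners,
 * ENXIO is returned and the caller should free the entry itself.
 *
 * A minimal producer sketch (hypothetical caller; field usage inferred from
 * this file rather than copied from an in-tree consumer) might look like:
 *
 *	struct tcp_log_dev_queue *entry;
 *
 *	entry = malloc(sizeof(*entry), M_TCPLOGDEV, M_WAITOK | M_ZERO);
 *	entry->tldq_buf = NULL;		(build lazily in the xform callback)
 *	entry->tldq_xform = my_xform;	(returns a tcp_log_common_header *)
 *	entry->tldq_dtor = my_dtor;	(frees the entry and its buffer)
 *	if (tcp_log_dev_add_log(entry) != 0)
 *		my_dtor(entry);		(no listeners; clean up ourselves)
 */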
int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * Any listener whose head pointer is NULL has drained the queue and
	 * is waiting for new entries. Point its head at this new entry and
	 * remember to wake the waiters.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

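/*
 * Module event handler. On MOD_LOAD, initialize the select structure and the
 * queue mutex and create the /dev/tcp_log node, readable only by root.
 * Unloading is not yet supported, so every other event returns EOPNOTSUPP.
 */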
static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
			 "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);
