1/*
2 * (C) 2006-2011 by Pablo Neira Ayuso <pablo@netfilter.org>
3 * (C) 2011 by Vyatta Inc. <http://www.vyatta.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include "netlink.h"
21#include "conntrackd.h"
22#include "filter.h"
23#include "log.h"
24
25#include <string.h>
26#include <errno.h>
27#include <sys/types.h>
28#include <sys/socket.h>
29#include <sys/fcntl.h>
30#include <libnetfilter_conntrack/libnetfilter_conntrack_tcp.h>
31
32struct nfct_handle *nl_init_event_handler(void)
33{
34	struct nfct_handle *h;
35
36	h = nfct_open(CONFIG(netlink).subsys_id, CONFIG(netlink).groups);
37	if (h == NULL)
38		return NULL;
39
40	if (CONFIG(netlink).events_reliable) {
41		int on = 1;
42
43		setsockopt(nfct_fd(h), SOL_NETLINK,
44			   NETLINK_BROADCAST_SEND_ERROR, &on, sizeof(int));
45
46		setsockopt(nfct_fd(h), SOL_NETLINK,
47			   NETLINK_NO_ENOBUFS, &on, sizeof(int));
48
49		dlog(LOG_NOTICE, "reliable ctnetlink event delivery "
50				 "is ENABLED.");
51	}
52
53	if (STATE(filter)) {
54		if (CONFIG(filter_from_kernelspace)) {
55			if (nfct_filter_attach(nfct_fd(h),
56					       STATE(filter)) == -1) {
57				dlog(LOG_ERR, "cannot set event filtering: %s",
58				     strerror(errno));
59			}
60			dlog(LOG_NOTICE, "using kernel-space event filtering");
61		} else
62			dlog(LOG_NOTICE, "using user-space event filtering");
63
64		nfct_filter_destroy(STATE(filter));
65	}
66
67	fcntl(nfct_fd(h), F_SETFL, O_NONBLOCK);
68
69	/* set up socket buffer size */
70	if (CONFIG(netlink_buffer_size) &&
71	    CONFIG(netlink_buffer_size) <=
72			CONFIG(netlink_buffer_size_max_grown)) {
73		/* we divide netlink_buffer_size by 2 here since value passed
74		   to kernel gets doubled in SO_RCVBUF; see net/core/sock.c */
75		CONFIG(netlink_buffer_size) =
76		  nfnl_rcvbufsiz(nfct_nfnlh(h), CONFIG(netlink_buffer_size)/2);
77	} else {
78		dlog(LOG_NOTICE, "NetlinkBufferSize is either not set or "
79				 "is greater than NetlinkBufferSizeMaxGrowth. "
80				 "Using current system buffer size");
81
82		socklen_t socklen = sizeof(unsigned int);
83		unsigned int read_size;
84
85		/* get current buffer size */
86		getsockopt(nfct_fd(h), SOL_SOCKET,
87			   SO_RCVBUF, &read_size, &socklen);
88
89		CONFIG(netlink_buffer_size) = read_size;
90	}
91
92	dlog(LOG_NOTICE, "netlink event socket buffer size has been set "
93			 "to %u bytes", CONFIG(netlink_buffer_size));
94
95	return h;
96}
97
98struct nlif_handle *nl_init_interface_handler(void)
99{
100	struct nlif_handle *h;
101	h = nlif_open();
102	if (h == NULL)
103		return NULL;
104
105	if (nlif_query(h) == -1) {
106		free(h);
107		return NULL;
108	}
109	fcntl(nlif_fd(h), F_SETFL, O_NONBLOCK);
110
111	return h;
112}
113
114static int warned = 0;
115
116void nl_resize_socket_buffer(struct nfct_handle *h)
117{
118	unsigned int s = CONFIG(netlink_buffer_size);
119
120	/* already warned that we have reached the maximum buffer size */
121	if (warned)
122		return;
123
124	/* since sock_setsockopt in net/core/sock.c doubles the size of socket
125	   buffer passed to it using nfnl_rcvbufsiz, only call nfnl_rcvbufsiz
126	   if new value is not greater than netlink_buffer_size_max_grown */
127	if (s*2 > CONFIG(netlink_buffer_size_max_grown)) {
128		dlog(LOG_WARNING,
129		     "netlink event socket buffer size cannot "
130		     "be doubled further since it will exceed "
131		     "NetlinkBufferSizeMaxGrowth. We are likely to "
132		     "be losing events, this may lead to "
133		     "unsynchronized replicas. Please, consider "
134		     "increasing netlink socket buffer size via "
135		     "NetlinkBufferSize and "
136		     "NetlinkBufferSizeMaxGrowth clauses in "
137		     "conntrackd.conf");
138		warned = 1;
139		return;
140	}
141
142	CONFIG(netlink_buffer_size) = nfnl_rcvbufsiz(nfct_nfnlh(h), s);
143
144	/* notify the sysadmin */
145	dlog(LOG_NOTICE, "netlink event socket buffer size has been doubled "
146			 "to %u bytes", CONFIG(netlink_buffer_size));
147}
148
149int nl_dump_conntrack_table(struct nfct_handle *h)
150{
151	return nfct_query(h, NFCT_Q_DUMP, &CONFIG(family));
152}
153
154static int
155nl_flush_selective_cb(enum nf_conntrack_msg_type type,
156		      struct nf_conntrack *ct, void *data)
157{
158	/* don't delete this conntrack, it's in the ignore filter */
159	if (ct_filter_conntrack(ct, 1))
160		return NFCT_CB_CONTINUE;
161
162	switch(type) {
163	case NFCT_T_UPDATE:
164		nl_destroy_conntrack(STATE(flush), ct);
165		break;
166	default:
167		STATE(stats).nl_dump_unknown_type++;
168		break;
169	}
170	return NFCT_CB_CONTINUE;
171}
172
173int nl_flush_conntrack_table_selective(void)
174{
175	struct nfct_handle *h;
176	int ret;
177
178	h = nfct_open(CONNTRACK, 0);
179	if (h == NULL) {
180		dlog(LOG_ERR, "cannot open handle");
181		return -1;
182	}
183	nfct_callback_register(h, NFCT_T_ALL, nl_flush_selective_cb, NULL);
184
185	ret = nfct_query(h, NFCT_Q_DUMP, &CONFIG(family));
186
187	nfct_close(h);
188
189	return ret;
190}
191
192int nl_send_resync(struct nfct_handle *h)
193{
194	int family = CONFIG(family);
195	return nfct_send(h, NFCT_Q_DUMP, &family);
196}
197
198/* if the handle has no callback, check for existence, otherwise, update */
199int nl_get_conntrack(struct nfct_handle *h, const struct nf_conntrack *ct)
200{
201	int ret = 1;
202	struct nf_conntrack *tmp;
203
204	tmp = nfct_new();
205	if (tmp == NULL)
206		return -1;
207
208	/* use the original tuple to check if it is there */
209	nfct_copy(tmp, ct, NFCT_CP_ORIG);
210
211	if (nfct_query(h, NFCT_Q_GET, tmp) == -1)
212		ret = (errno == ENOENT) ? 0 : -1;
213
214	nfct_destroy(tmp);
215	return ret;
216}
217
218int nl_create_conntrack(struct nfct_handle *h,
219			const struct nf_conntrack *orig,
220			int timeout)
221{
222	int ret;
223	struct nf_conntrack *ct;
224
225	ct = nfct_clone(orig);
226	if (ct == NULL)
227		return -1;
228
229	if (timeout > 0)
230		nfct_set_attr_u32(ct, ATTR_TIMEOUT, timeout);
231
232	/* we hit error if we try to change the expected bit */
233	if (nfct_attr_is_set(ct, ATTR_STATUS)) {
234		uint32_t status = nfct_get_attr_u32(ct, ATTR_STATUS);
235		status &= ~IPS_EXPECTED;
236		nfct_set_attr_u32(ct, ATTR_STATUS, status);
237	}
238
239	nfct_setobjopt(ct, NFCT_SOPT_SETUP_REPLY);
240
241	/* disable TCP window tracking for recovered connections if required */
242	if (nfct_attr_is_set(ct, ATTR_TCP_STATE)) {
243		uint8_t flags = IP_CT_TCP_FLAG_SACK_PERM;
244
245		if (!CONFIG(sync).tcp_window_tracking)
246			flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
247		else
248			flags |= IP_CT_TCP_FLAG_WINDOW_SCALE;
249
250		/* FIXME: workaround, we should send TCP flags in updates */
251		if (nfct_get_attr_u8(ct, ATTR_TCP_STATE) >=
252						TCP_CONNTRACK_TIME_WAIT) {
253			flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
254		}
255		nfct_set_attr_u8(ct, ATTR_TCP_FLAGS_ORIG, flags);
256		nfct_set_attr_u8(ct, ATTR_TCP_MASK_ORIG, flags);
257		nfct_set_attr_u8(ct, ATTR_TCP_FLAGS_REPL, flags);
258		nfct_set_attr_u8(ct, ATTR_TCP_MASK_REPL, flags);
259	}
260
261	ret = nfct_query(h, NFCT_Q_CREATE, ct);
262	nfct_destroy(ct);
263
264	return ret;
265}
266
267int nl_update_conntrack(struct nfct_handle *h,
268			const struct nf_conntrack *orig,
269			int timeout)
270{
271	int ret;
272	struct nf_conntrack *ct;
273
274	ct = nfct_clone(orig);
275	if (ct == NULL)
276		return -1;
277
278	if (timeout > 0)
279		nfct_set_attr_u32(ct, ATTR_TIMEOUT, timeout);
280
281	/* unset NAT info, otherwise we hit error */
282	nfct_attr_unset(ct, ATTR_SNAT_IPV4);
283	nfct_attr_unset(ct, ATTR_DNAT_IPV4);
284	nfct_attr_unset(ct, ATTR_SNAT_PORT);
285	nfct_attr_unset(ct, ATTR_DNAT_PORT);
286
287	if (nfct_attr_is_set(ct, ATTR_STATUS)) {
288		uint32_t status = nfct_get_attr_u32(ct, ATTR_STATUS);
289		status &= ~IPS_NAT_MASK;
290		nfct_set_attr_u32(ct, ATTR_STATUS, status);
291	}
292	/* we have to unset the helper to avoid EBUSY in reset timers */
293	if (nfct_attr_is_set(ct, ATTR_HELPER_NAME))
294		nfct_attr_unset(ct, ATTR_HELPER_NAME);
295
296	/* we hit error if we try to update the master conntrack */
297	if (ct_is_related(ct)) {
298		nfct_attr_unset(ct, ATTR_MASTER_L3PROTO);
299		nfct_attr_unset(ct, ATTR_MASTER_L4PROTO);
300		nfct_attr_unset(ct, ATTR_MASTER_IPV4_SRC);
301		nfct_attr_unset(ct, ATTR_MASTER_IPV4_DST);
302		nfct_attr_unset(ct, ATTR_MASTER_IPV6_SRC);
303		nfct_attr_unset(ct, ATTR_MASTER_IPV6_DST);
304		nfct_attr_unset(ct, ATTR_MASTER_PORT_SRC);
305		nfct_attr_unset(ct, ATTR_MASTER_PORT_DST);
306	}
307
308	/* disable TCP window tracking for recovered connections if required */
309	if (nfct_attr_is_set(ct, ATTR_TCP_STATE)) {
310		uint8_t flags = IP_CT_TCP_FLAG_SACK_PERM;
311
312		if (!CONFIG(sync).tcp_window_tracking)
313			flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
314		else
315			flags |= IP_CT_TCP_FLAG_WINDOW_SCALE;
316
317		/* FIXME: workaround, we should send TCP flags in updates */
318		if (nfct_get_attr_u8(ct, ATTR_TCP_STATE) >=
319						TCP_CONNTRACK_TIME_WAIT) {
320			flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
321		}
322		nfct_set_attr_u8(ct, ATTR_TCP_FLAGS_ORIG, flags);
323		nfct_set_attr_u8(ct, ATTR_TCP_MASK_ORIG, flags);
324		nfct_set_attr_u8(ct, ATTR_TCP_FLAGS_REPL, flags);
325		nfct_set_attr_u8(ct, ATTR_TCP_MASK_REPL, flags);
326	}
327
328	ret = nfct_query(h, NFCT_Q_UPDATE, ct);
329	nfct_destroy(ct);
330
331	return ret;
332}
333
334int nl_destroy_conntrack(struct nfct_handle *h, const struct nf_conntrack *ct)
335{
336	return nfct_query(h, NFCT_Q_DESTROY, ct);
337}
338
339int nl_create_expect(struct nfct_handle *h, const struct nf_expect *orig,
340		     int timeout)
341{
342	int ret;
343	struct nf_expect *exp;
344
345	exp = nfexp_clone(orig);
346	if (exp == NULL)
347		return -1;
348
349	if (timeout > 0)
350		nfexp_set_attr_u32(exp, ATTR_EXP_TIMEOUT, timeout);
351
352	ret = nfexp_query(h, NFCT_Q_CREATE, exp);
353	nfexp_destroy(exp);
354
355	return ret;
356}
357
358int nl_destroy_expect(struct nfct_handle *h, const struct nf_expect *exp)
359{
360	return nfexp_query(h, NFCT_Q_DESTROY, exp);
361}
362
363/* if the handle has no callback, check for existence, otherwise, update */
364int nl_get_expect(struct nfct_handle *h, const struct nf_expect *exp)
365{
366	int ret = 1;
367	struct nf_expect *tmp;
368
369	/* XXX: we only need the expectation, not the mask and the master. */
370	tmp = nfexp_clone(exp);
371	if (tmp == NULL)
372		return -1;
373
374	if (nfexp_query(h, NFCT_Q_GET, tmp) == -1)
375		ret = (errno == ENOENT) ? 0 : -1;
376
377	nfexp_destroy(tmp);
378	return ret;
379}
380
381int nl_dump_expect_table(struct nfct_handle *h)
382{
383	return nfexp_query(h, NFCT_Q_DUMP, &CONFIG(family));
384}
385
386int nl_flush_expect_table(struct nfct_handle *h)
387{
388	return nfexp_query(h, NFCT_Q_FLUSH, &CONFIG(family));
389}
390
391int nl_send_expect_resync(struct nfct_handle *h)
392{
393	int family = CONFIG(family);
394	return nfexp_send(h, NFCT_Q_DUMP, &family);
395}
396