/*
 * This is a module which is used for queueing IPv6 packets and
 * communicating with userspace via netlink.
 *
 * (C) 2001 Fernando Anton, this code is GPL.
 *     IPv64 Project - Work based on the IPv64 draft by Arturo Azcorra.
 *     Universidad Carlos III de Madrid - Leganes (Madrid) - Spain
 *     Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain
 *     email: fanton@it.uc3m.es
 *
 * 2001-11-06: First try. Working with ip_queue.c for IPv4 and trying
 *             to adapt it to IPv6.
 *             HEAVILY based on ip_queue.c by James Morris. It's just
 *             a slightly modified version of it, so he's nearly the
 *             real coder of this.
 *             Few changes needed, mainly the hard_routing code and
 *             the netlink socket protocol (we're NETLINK_IP6_FW).
 * 2002-06-25: Code cleanup. [JM: ported cleanup over from ip_queue.c]
 */
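
/*
 * Userspace interaction, roughly: a single peer binds a NETLINK_IP6_FW
 * socket and sends an IPQM_MODE message selecting IPQ_COPY_META or
 * IPQ_COPY_PACKET; queued packets are then delivered to it as IPQM_PACKET
 * messages, and it answers each one with an IPQM_VERDICT message
 * (optionally carrying a mangled payload).  See ipq_receive_peer() below.
 */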
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/brlock.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/netfilter_ipv4/ip_queue.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>

#define IPQ_QMAX_DEFAULT 1024
#define IPQ_PROC_FS_NAME "ip6_queue"
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"

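/*
 * Original addresses recorded at LOCAL_OUT so we can detect whether
 * userspace rewrote them and re-route if so (see ipq_mangle_ipv6()).
 */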
struct ipq_rt_info {
	struct in6_addr daddr;
	struct in6_addr saddr;
};

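/*
 * One queued packet awaiting a verdict from the userspace peer.  Entries
 * live on queue_list and are identified to userspace by their kernel
 * address (packet_id).
 */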
struct ipq_queue_entry {
	struct list_head list;
	struct nf_info *info;
	struct sk_buff *skb;
	struct ipq_rt_info rt_info;
};

typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);

static unsigned char copy_mode = IPQ_COPY_NONE;
static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
static rwlock_t queue_lock = RW_LOCK_UNLOCKED;
static int peer_pid;
static unsigned int copy_range;
static unsigned int queue_total;
static struct sock *ipqnl;
static LIST_HEAD(queue_list);
static DECLARE_MUTEX(ipqnl_sem);

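/*
 * Hand the packet back to the netfilter core with the given verdict and
 * free the queue entry; after this the skb is no longer ours.
 */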
static void
ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
{
	nf_reinject(entry->skb, entry->info, verdict);
	kfree(entry);
}

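/*
 * The __-prefixed helpers below assume queue_lock is already held by the
 * caller.  This one adds an entry to queue_list unless the queue is
 * already at queue_maxlen.
 */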
static inline int
__ipq_enqueue_entry(struct ipq_queue_entry *entry)
{
	if (queue_total >= queue_maxlen) {
		if (net_ratelimit())
			printk(KERN_WARNING "ip6_queue: full at %d entries, "
			       "dropping packet(s).\n", queue_total);
		return -ENOSPC;
	}
	list_add(&entry->list, &queue_list);
	queue_total++;
	return 0;
}

/*
 * Find and return a queued entry matched by cmpfn, or return the last
 * entry if cmpfn is NULL.
 */
static inline struct ipq_queue_entry *
__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
{
	struct list_head *p;

	list_for_each_prev(p, &queue_list) {
		struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;

		if (!cmpfn || cmpfn(entry, data))
			return entry;
	}
	return NULL;
}

static inline void
__ipq_dequeue_entry(struct ipq_queue_entry *entry)
{
	list_del(&entry->list);
	queue_total--;
}

static inline struct ipq_queue_entry *
__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
{
	struct ipq_queue_entry *entry;

	entry = __ipq_find_entry(cmpfn, data);
	if (entry == NULL)
		return NULL;

	__ipq_dequeue_entry(entry);
	return entry;
}

static inline void
__ipq_flush(int verdict)
{
	struct ipq_queue_entry *entry;

	while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
		ipq_issue_verdict(entry, verdict);
}

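/*
 * Switch the copy mode requested by the peer: IPQ_COPY_NONE disables
 * queueing to userspace, IPQ_COPY_META sends metadata only, and
 * IPQ_COPY_PACKET additionally copies up to 'range' bytes of payload
 * (capped at 0xFFFF).
 */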
static inline int
__ipq_set_mode(unsigned char mode, unsigned int range)
{
	int status = 0;

	switch (mode) {
	case IPQ_COPY_NONE:
	case IPQ_COPY_META:
		copy_mode = mode;
		copy_range = 0;
		break;

	case IPQ_COPY_PACKET:
		copy_mode = mode;
		copy_range = range;
		if (copy_range > 0xFFFF)
			copy_range = 0xFFFF;
		break;

	default:
		status = -EINVAL;
	}
	return status;
}

static inline void
__ipq_reset(void)
{
	peer_pid = 0;
	__ipq_set_mode(IPQ_COPY_NONE, 0);
	__ipq_flush(NF_DROP);
}

static struct ipq_queue_entry *
ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
{
	struct ipq_queue_entry *entry;

	write_lock_bh(&queue_lock);
	entry = __ipq_find_dequeue_entry(cmpfn, data);
	write_unlock_bh(&queue_lock);
	return entry;
}

static void
ipq_flush(int verdict)
{
	write_lock_bh(&queue_lock);
	__ipq_flush(verdict);
	write_unlock_bh(&queue_lock);
}

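/*
 * Build the IPQM_PACKET netlink message for one queued packet.  How much
 * payload gets copied depends on the current copy mode; on failure *errp
 * is set and NULL is returned.
 */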
static struct sk_buff *
ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
{
	unsigned char *old_tail;
	size_t size = 0;
	size_t data_len = 0;
	struct sk_buff *skb;
	struct ipq_packet_msg *pmsg;
	struct nlmsghdr *nlh;

	read_lock_bh(&queue_lock);

	switch (copy_mode) {
	case IPQ_COPY_META:
	case IPQ_COPY_NONE:
		size = NLMSG_SPACE(sizeof(*pmsg));
		data_len = 0;
		break;

	case IPQ_COPY_PACKET:
		if (copy_range == 0 || copy_range > entry->skb->len)
			data_len = entry->skb->len;
		else
			data_len = copy_range;

		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
		break;

	default:
		*errp = -EINVAL;
		read_unlock_bh(&queue_lock);
		return NULL;
	}

	read_unlock_bh(&queue_lock);

	skb = alloc_skb(size, GFP_ATOMIC);
	if (!skb)
		goto nlmsg_failure;

	old_tail = skb->tail;
	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
	pmsg = NLMSG_DATA(nlh);
	memset(pmsg, 0, sizeof(*pmsg));

	pmsg->packet_id       = (unsigned long)entry;
	pmsg->data_len        = data_len;
	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
	pmsg->mark            = entry->skb->nfmark;
	pmsg->hook            = entry->info->hook;
	pmsg->hw_protocol     = entry->skb->protocol;

	if (entry->info->indev)
		strcpy(pmsg->indev_name, entry->info->indev->name);
	else
		pmsg->indev_name[0] = '\0';

	if (entry->info->outdev)
		strcpy(pmsg->outdev_name, entry->info->outdev->name);
	else
		pmsg->outdev_name[0] = '\0';

	if (entry->info->indev && entry->skb->dev) {
		pmsg->hw_type = entry->skb->dev->type;
		if (entry->skb->dev->hard_header_parse)
			pmsg->hw_addrlen =
				entry->skb->dev->hard_header_parse(entry->skb,
				                                   pmsg->hw_addr);
	}

	if (data_len)
		memcpy(pmsg->payload, entry->skb->data, data_len);

	nlh->nlmsg_len = skb->tail - old_tail;
	return skb;

nlmsg_failure:
	if (skb)
		kfree_skb(skb);
	*errp = -EINVAL;
	printk(KERN_ERR "ip6_queue: error creating packet message\n");
	return NULL;
}

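/*
 * Netfilter queue handler (registered for PF_INET6): copy the packet
 * metadata/payload to the peer via netlink and park the entry on
 * queue_list until a verdict arrives.
 */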
static int
ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
{
	int status = -EINVAL;
	struct sk_buff *nskb;
	struct ipq_queue_entry *entry;

	if (copy_mode == IPQ_COPY_NONE)
		return -EAGAIN;

	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
	if (entry == NULL) {
		printk(KERN_ERR "ip6_queue: OOM in ipq_enqueue_packet()\n");
		return -ENOMEM;
	}

	entry->info = info;
	entry->skb = skb;

	if (entry->info->hook == NF_IP_LOCAL_OUT) {
		struct ipv6hdr *iph = skb->nh.ipv6h;

		entry->rt_info.daddr = iph->daddr;
		entry->rt_info.saddr = iph->saddr;
	}

	nskb = ipq_build_packet_message(entry, &status);
	if (nskb == NULL)
		goto err_out_free;

	write_lock_bh(&queue_lock);

	if (!peer_pid)
		goto err_out_unlock;

	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
	if (status < 0)
		goto err_out_unlock;

	status = __ipq_enqueue_entry(entry);
	if (status < 0)
		goto err_out_unlock;

	write_unlock_bh(&queue_lock);
	return status;

err_out_unlock:
	write_unlock_bh(&queue_lock);

err_out_free:
	kfree(entry);
	return status;
}

/*
 * Taken from net/ipv6/ip6_output.c
 *
 * We should use the copy there, but it is defined static, so we keep
 * our own version here and leave things as they are for now.
 *
 * If that one is modified, this one should be modified too.
 */
static int
route6_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	struct flowi fl;

	fl.proto = iph->nexthdr;
	fl.fl6_dst = &iph->daddr;
	fl.fl6_src = &iph->saddr;
	fl.oif = skb->sk ? skb->sk->bound_dev_if : 0;
	fl.fl6_flowlabel = 0;
	fl.uli_u.ports.dport = 0;
	fl.uli_u.ports.sport = 0;

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		if (net_ratelimit())
			printk(KERN_DEBUG "route6_me_harder: No more route.\n");
		/* Release the unusable route before bailing out. */
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}

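/*
 * Replace the queued packet's payload with the copy returned by the peer
 * in its verdict message, growing the skb if necessary.  If this changed
 * the addresses of a locally generated packet, re-route it.
 */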
static int
ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
{
	int diff;
	struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload;

	if (v->data_len < sizeof(*user_iph))
		return 0;
	diff = v->data_len - e->skb->len;
	if (diff < 0)
		skb_trim(e->skb, v->data_len);
	else if (diff > 0) {
		if (v->data_len > 0xFFFF)
			return -EINVAL;
		if (diff > skb_tailroom(e->skb)) {
			struct sk_buff *newskb;

			newskb = skb_copy_expand(e->skb,
			                         skb_headroom(e->skb),
			                         diff,
			                         GFP_ATOMIC);
			if (newskb == NULL) {
				printk(KERN_WARNING "ip6_queue: OOM "
				       "in mangle, dropping packet\n");
				return -ENOMEM;
			}
			if (e->skb->sk)
				skb_set_owner_w(newskb, e->skb->sk);
			kfree_skb(e->skb);
			e->skb = newskb;
		}
		skb_put(e->skb, diff);
	}
	memcpy(e->skb->data, v->payload, v->data_len);
	e->skb->nfcache |= NFC_ALTERED;

	/*
	 * Extra routing may be needed on local out, as the QUEUE target
	 * never returns control to the table.
	 * Comparing the addresses is not the nicest way to detect this,
	 * but it works.
	 */
	if (e->info->hook == NF_IP_LOCAL_OUT) {
		struct ipv6hdr *iph = e->skb->nh.ipv6h;

		if (ipv6_addr_cmp(&iph->daddr, &e->rt_info.daddr) ||
		    ipv6_addr_cmp(&iph->saddr, &e->rt_info.saddr))
			return route6_me_harder(e->skb);
	}
	return 0;
}

static inline int
id_cmp(struct ipq_queue_entry *e, unsigned long id)
{
	return (id == (unsigned long)e);
}

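/*
 * Apply a verdict received from the peer: look the packet up by its id,
 * optionally mangle it with the payload supplied in the message, then
 * reinject it (or drop it if mangling failed).
 */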
static int
ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
{
	struct ipq_queue_entry *entry;

	if (vmsg->value > NF_MAX_VERDICT)
		return -EINVAL;

	entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
	if (entry == NULL)
		return -ENOENT;
	else {
		int verdict = vmsg->value;

		if (vmsg->data_len && vmsg->data_len == len)
			if (ipq_mangle_ipv6(vmsg, entry) < 0)
				verdict = NF_DROP;

		ipq_issue_verdict(entry, verdict);
		return 0;
	}
}

static int
ipq_set_mode(unsigned char mode, unsigned int range)
{
	int status;

	write_lock_bh(&queue_lock);
	status = __ipq_set_mode(mode, range);
	write_unlock_bh(&queue_lock);
	return status;
}

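/*
 * Dispatch an IPQM_MODE or IPQM_VERDICT request from the userspace peer.
 */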
static int
ipq_receive_peer(struct ipq_peer_msg *pmsg,
                 unsigned char type, unsigned int len)
{
	int status = 0;

	if (len < sizeof(*pmsg))
		return -EINVAL;

	switch (type) {
	case IPQM_MODE:
		status = ipq_set_mode(pmsg->msg.mode.value,
		                      pmsg->msg.mode.range);
		break;

	case IPQM_VERDICT:
		if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
			status = -EINVAL;
		else
			status = ipq_set_verdict(&pmsg->msg.verdict,
			                         len - sizeof(*pmsg));
		break;

	default:
		status = -EINVAL;
	}
	return status;
}

static int
dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
{
	if (entry->info->indev)
		if (entry->info->indev->ifindex == ifindex)
			return 1;

	if (entry->info->outdev)
		if (entry->info->outdev->ifindex == ifindex)
			return 1;

	return 0;
}

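/*
 * Drop (NF_DROP) every queued packet whose input or output device has
 * the given ifindex; used when an interface goes down.
 */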
static void
ipq_dev_drop(int ifindex)
{
	struct ipq_queue_entry *entry;

	while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
		ipq_issue_verdict(entry, NF_DROP);
}

#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)

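/*
 * Validate and process a single netlink request: sanity-check the
 * header, require CAP_NET_ADMIN, bind to the first peer pid seen
 * (-EBUSY for anyone else) and hand the payload to ipq_receive_peer().
 */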
static inline void
ipq_rcv_skb(struct sk_buff *skb)
{
	int status, type, pid, flags, nlmsglen, skblen;
	struct nlmsghdr *nlh;

	skblen = skb->len;
	if (skblen < sizeof(*nlh))
		return;

	nlh = (struct nlmsghdr *)skb->data;
	nlmsglen = nlh->nlmsg_len;
	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
		return;

	pid = nlh->nlmsg_pid;
	flags = nlh->nlmsg_flags;

	if (pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
		RCV_SKB_FAIL(-EINVAL);

	if (flags & MSG_TRUNC)
		RCV_SKB_FAIL(-ECOMM);

	type = nlh->nlmsg_type;
	if (type < NLMSG_NOOP || type >= IPQM_MAX)
		RCV_SKB_FAIL(-EINVAL);

	if (type <= IPQM_BASE)
		return;

	if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
		RCV_SKB_FAIL(-EPERM);

	write_lock_bh(&queue_lock);

	if (peer_pid) {
		if (peer_pid != pid) {
			write_unlock_bh(&queue_lock);
			RCV_SKB_FAIL(-EBUSY);
		}
	} else
		peer_pid = pid;

	write_unlock_bh(&queue_lock);

	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
	                          skblen - NLMSG_LENGTH(0));
	if (status < 0)
		RCV_SKB_FAIL(status);

	if (flags & NLM_F_ACK)
		netlink_ack(skb, nlh, 0);
	return;
}

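/*
 * data_ready callback for the kernel netlink socket: drain the receive
 * queue, serialised by ipqnl_sem so only one context processes requests
 * at a time.
 */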
static void
ipq_rcv_sk(struct sock *sk, int len)
{
	do {
		struct sk_buff *skb;

		if (down_trylock(&ipqnl_sem))
			return;

		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
			ipq_rcv_skb(skb);
			kfree_skb(skb);
		}

		up(&ipqnl_sem);

	} while (ipqnl && ipqnl->receive_queue.qlen);
}

static int
ipq_rcv_dev_event(struct notifier_block *this,
                  unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	/* Drop any packets associated with the downed device */
	if (event == NETDEV_DOWN)
		ipq_dev_drop(dev->ifindex);
	return NOTIFY_DONE;
}

static struct notifier_block ipq_dev_notifier = {
	ipq_rcv_dev_event,
	NULL,
	0
};

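/*
 * If our peer closes its netlink socket, forget it and drop everything
 * that is still queued.
 */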
static int
ipq_rcv_nl_event(struct notifier_block *this,
                 unsigned long event, void *ptr)
{
	struct netlink_notify *n = ptr;

	if (event == NETLINK_URELEASE &&
	    n->protocol == NETLINK_IP6_FW && n->pid) {
		write_lock_bh(&queue_lock);
		if (n->pid == peer_pid)
			__ipq_reset();
		write_unlock_bh(&queue_lock);
	}
	return NOTIFY_DONE;
}

static struct notifier_block ipq_nl_notifier = {
	ipq_rcv_nl_event,
	NULL,
	0
};

static struct ctl_table_header *ipq_sysctl_header;

static ctl_table ipq_table[] = {
	{ NET_IPQ_QMAX, NET_IPQ_QMAX_NAME, &queue_maxlen,
	  sizeof(queue_maxlen), 0644, NULL, proc_dointvec },
	{ 0 }
};

static ctl_table ipq_dir_table[] = {
	{ NET_IPV6, "ipv6", NULL, 0, 0555, ipq_table, 0, 0, 0, 0, 0 },
	{ 0 }
};

static ctl_table ipq_root_table[] = {
	{ CTL_NET, "net", NULL, 0, 0555, ipq_dir_table, 0, 0, 0, 0, 0 },
	{ 0 }
};

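/*
 * /proc/net/ip6_queue read handler: report the peer pid, copy mode/range
 * and queue fill level.
 */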
static int
ipq_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len;

	read_lock_bh(&queue_lock);

	len = sprintf(buffer,
	              "Peer PID          : %d\n"
	              "Copy mode         : %hu\n"
	              "Copy range        : %u\n"
	              "Queue length      : %u\n"
	              "Queue max. length : %u\n",
	              peer_pid,
	              copy_mode,
	              copy_range,
	              queue_total,
	              queue_maxlen);

	read_unlock_bh(&queue_lock);

	*start = buffer + offset;
	len -= offset;
	if (len > length)
		len = length;
	else if (len < 0)
		len = 0;
	return len;
}

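/*
 * Shared module init/cleanup path.  On init, register the netlink socket,
 * /proc entry, netdevice notifier, sysctl table and the PF_INET6 queue
 * handler; on cleanup (or init failure) tear them down in reverse order.
 * The BR_NETPROTO_LOCK write/unlock pair on the cleanup path waits for
 * packets still traversing the netfilter hooks before the final flush.
 */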
static int
init_or_cleanup(int init)
{
	int status = -ENOMEM;
	struct proc_dir_entry *proc;

	if (!init)
		goto cleanup;

	netlink_register_notifier(&ipq_nl_notifier);
	ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk);
	if (ipqnl == NULL) {
		printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
		goto cleanup_netlink_notifier;
	}

	proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
	if (proc)
		proc->owner = THIS_MODULE;
	else {
		printk(KERN_ERR "ip6_queue: failed to create proc entry\n");
		goto cleanup_ipqnl;
	}

	register_netdevice_notifier(&ipq_dev_notifier);
	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);

	status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL);
	if (status < 0) {
		printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
		goto cleanup_sysctl;
	}
	return status;

cleanup:
	nf_unregister_queue_handler(PF_INET6);
	br_write_lock_bh(BR_NETPROTO_LOCK);
	br_write_unlock_bh(BR_NETPROTO_LOCK);
	ipq_flush(NF_DROP);

cleanup_sysctl:
	unregister_sysctl_table(ipq_sysctl_header);
	unregister_netdevice_notifier(&ipq_dev_notifier);
	proc_net_remove(IPQ_PROC_FS_NAME);

cleanup_ipqnl:
	sock_release(ipqnl->socket);
	down(&ipqnl_sem);
	up(&ipqnl_sem);

cleanup_netlink_notifier:
	netlink_unregister_notifier(&ipq_nl_notifier);
	return status;
}

static int __init init(void)
{
	return init_or_cleanup(1);
}

static void __exit fini(void)
{
	init_or_cleanup(0);
}

MODULE_DESCRIPTION("IPv6 packet queue handler");
MODULE_LICENSE("GPL");

module_init(init);
module_exit(fini);