1/*
2 * This is a module which is used for queueing IPv4 packets and
3 * communicating with userspace via netlink.
4 *
5 * (C) 2000-2002 James Morris, this code is GPL.
6 *
7 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
8 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
9 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
10 *             Zander).
11 * 2000-08-01: Added Nick Williams' MAC support.
12 * 2002-06-25: Code cleanup.
13 *
14 */
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/init.h>
18#include <linux/ip.h>
19#include <linux/notifier.h>
20#include <linux/netdevice.h>
21#include <linux/netfilter.h>
22#include <linux/netfilter_ipv4/ip_queue.h>
23#include <linux/netfilter_ipv4/ip_tables.h>
24#include <linux/netlink.h>
25#include <linux/spinlock.h>
26#include <linux/brlock.h>
27#include <linux/sysctl.h>
28#include <linux/proc_fs.h>
29#include <net/sock.h>
30#include <net/route.h>
31
#define IPQ_QMAX_DEFAULT 1024			/* default cap on queued packets */
#define IPQ_PROC_FS_NAME "ip_queue"		/* /proc/net entry name */
#define NET_IPQ_QMAX 2088			/* sysctl binary ctl_name id */
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"	/* sysctl procname */
36
/*
 * Routing keys saved for packets queued on NF_IP_LOCAL_OUT so that the
 * packet can be re-routed if userspace changes any of them.
 */
struct ipq_rt_info {
	__u8 tos;
	__u32 daddr;
	__u32 saddr;
};

/* One queued packet awaiting a verdict from the userspace peer. */
struct ipq_queue_entry {
	struct list_head list;		/* link on queue_list */
	struct nf_info *info;		/* netfilter reinjection context */
	struct sk_buff *skb;		/* the queued packet */
	struct ipq_rt_info rt_info;	/* saved routing keys (LOCAL_OUT only) */
};

/* Entry-match predicate; non-zero return means "this entry matches". */
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
51
/* Packet copy mode requested by the peer (IPQ_COPY_*). */
static unsigned char copy_mode = IPQ_COPY_NONE;
/* Upper bound on the number of queued packets. */
static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
/* Protects peer_pid, copy_mode, copy_range, queue_total and queue_list. */
static rwlock_t queue_lock = RW_LOCK_UNLOCKED;
/* Netlink pid of the single userspace peer; 0 means no peer attached. */
static int peer_pid;
/* Max payload bytes copied to userspace in IPQ_COPY_PACKET mode. */
static unsigned int copy_range;
/* Current number of entries on queue_list. */
static unsigned int queue_total;
/* Kernel-side NETLINK_FIREWALL socket. */
static struct sock *ipqnl;
/* Packets awaiting verdicts; new entries are added at the head. */
static LIST_HEAD(queue_list);
/* Serializes message processing in ipq_rcv_sk(). */
static DECLARE_MUTEX(ipqnl_sem);
61
62static void
63ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
64{
65	nf_reinject(entry->skb, entry->info, verdict);
66	kfree(entry);
67}
68
69static inline int
70__ipq_enqueue_entry(struct ipq_queue_entry *entry)
71{
72       if (queue_total >= queue_maxlen) {
73               if (net_ratelimit())
74                       printk(KERN_WARNING "ip_queue: full at %d entries, "
75                              "dropping packet(s).\n", queue_total);
76               return -ENOSPC;
77       }
78       list_add(&entry->list, &queue_list);
79       queue_total++;
80       return 0;
81}
82
83/*
84 * Find and return a queued entry matched by cmpfn, or return the last
85 * entry if cmpfn is NULL.
86 */
87static inline struct ipq_queue_entry *
88__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
89{
90	struct list_head *p;
91
92	list_for_each_prev(p, &queue_list) {
93		struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
94
95		if (!cmpfn || cmpfn(entry, data))
96			return entry;
97	}
98	return NULL;
99}
100
101static inline void
102__ipq_dequeue_entry(struct ipq_queue_entry *entry)
103{
104	list_del(&entry->list);
105	queue_total--;
106}
107
108static inline struct ipq_queue_entry *
109__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
110{
111	struct ipq_queue_entry *entry;
112
113	entry = __ipq_find_entry(cmpfn, data);
114	if (entry == NULL)
115		return NULL;
116
117	__ipq_dequeue_entry(entry);
118	return entry;
119}
120
121
122static inline void
123__ipq_flush(int verdict)
124{
125	struct ipq_queue_entry *entry;
126
127	while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
128		ipq_issue_verdict(entry, verdict);
129}
130
131static inline int
132__ipq_set_mode(unsigned char mode, unsigned int range)
133{
134	int status = 0;
135
136	switch(mode) {
137	case IPQ_COPY_NONE:
138	case IPQ_COPY_META:
139		copy_mode = mode;
140		copy_range = 0;
141		break;
142
143	case IPQ_COPY_PACKET:
144		copy_mode = mode;
145		copy_range = range;
146		if (copy_range > 0xFFFF)
147			copy_range = 0xFFFF;
148		break;
149
150	default:
151		status = -EINVAL;
152
153	}
154	return status;
155}
156
157static inline void
158__ipq_reset(void)
159{
160	peer_pid = 0;
161	__ipq_set_mode(IPQ_COPY_NONE, 0);
162	__ipq_flush(NF_DROP);
163}
164
165static struct ipq_queue_entry *
166ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
167{
168	struct ipq_queue_entry *entry;
169
170	write_lock_bh(&queue_lock);
171	entry = __ipq_find_dequeue_entry(cmpfn, data);
172	write_unlock_bh(&queue_lock);
173	return entry;
174}
175
176static void
177ipq_flush(int verdict)
178{
179	write_lock_bh(&queue_lock);
180	__ipq_flush(verdict);
181	write_unlock_bh(&queue_lock);
182}
183
/*
 * Build an IPQM_PACKET netlink message describing a queued packet,
 * copying payload according to the current copy mode/range.
 * Returns a freshly allocated skb, or NULL with *errp set.
 * Note: *errp is written only on failure paths.
 */
static struct sk_buff *
ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
{
	unsigned char *old_tail;
	size_t size = 0;
	size_t data_len = 0;
	struct sk_buff *skb;
	struct ipq_packet_msg *pmsg;
	struct nlmsghdr *nlh;

	/* Snapshot copy_mode/copy_range consistently under the lock. */
	read_lock_bh(&queue_lock);

	switch (copy_mode) {
	case IPQ_COPY_META:
	case IPQ_COPY_NONE:
		size = NLMSG_SPACE(sizeof(*pmsg));
		data_len = 0;
		break;

	case IPQ_COPY_PACKET:
		/* range 0 means "whole packet"; never copy past skb->len */
		if (copy_range == 0 || copy_range > entry->skb->len)
			data_len = entry->skb->len;
		else
			data_len = copy_range;

		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
		break;

	default:
		*errp = -EINVAL;
		read_unlock_bh(&queue_lock);
		return NULL;
	}

	read_unlock_bh(&queue_lock);

	skb = alloc_skb(size, GFP_ATOMIC);
	if (!skb)
		goto nlmsg_failure;

	old_tail= skb->tail;
	/* NLMSG_PUT jumps to nlmsg_failure if the skb lacks space. */
	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
	pmsg = NLMSG_DATA(nlh);
	memset(pmsg, 0, sizeof(*pmsg));

	/* The entry's kernel address doubles as the packet id that the
	 * peer echoes back in its verdict (see id_cmp()). */
	pmsg->packet_id       = (unsigned long )entry;
	pmsg->data_len        = data_len;
	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
	pmsg->mark            = entry->skb->nfmark;
	pmsg->hook            = entry->info->hook;
	pmsg->hw_protocol     = entry->skb->protocol;

	if (entry->info->indev)
		strcpy(pmsg->indev_name, entry->info->indev->name);
	else
		pmsg->indev_name[0] = '\0';

	if (entry->info->outdev)
		strcpy(pmsg->outdev_name, entry->info->outdev->name);
	else
		pmsg->outdev_name[0] = '\0';

	/* Link-layer address info, only for packets that arrived on a device. */
	if (entry->info->indev && entry->skb->dev) {
		pmsg->hw_type = entry->skb->dev->type;
		if (entry->skb->dev->hard_header_parse)
			pmsg->hw_addrlen =
				entry->skb->dev->hard_header_parse(entry->skb,
				                                   pmsg->hw_addr);
	}

	if (data_len)
		memcpy(pmsg->payload, entry->skb->data, data_len);

	nlh->nlmsg_len = skb->tail - old_tail;
	return skb;

nlmsg_failure:
	/* skb is NULL if alloc_skb() failed, non-NULL if NLMSG_PUT bailed. */
	if (skb)
		kfree_skb(skb);
	*errp = -EINVAL;
	printk(KERN_ERR "ip_queue: error creating packet message\n");
	return NULL;
}
268
269static int
270ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
271{
272	int status = -EINVAL;
273	struct sk_buff *nskb;
274	struct ipq_queue_entry *entry;
275
276	if (copy_mode == IPQ_COPY_NONE)
277		return -EAGAIN;
278
279	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
280	if (entry == NULL) {
281		printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
282		return -ENOMEM;
283	}
284
285	entry->info = info;
286	entry->skb = skb;
287
288	if (entry->info->hook == NF_IP_LOCAL_OUT) {
289		struct iphdr *iph = skb->nh.iph;
290
291		entry->rt_info.tos = iph->tos;
292		entry->rt_info.daddr = iph->daddr;
293		entry->rt_info.saddr = iph->saddr;
294	}
295
296	nskb = ipq_build_packet_message(entry, &status);
297	if (nskb == NULL)
298		goto err_out_free;
299
300	write_lock_bh(&queue_lock);
301
302	if (!peer_pid)
303		goto err_out_unlock;
304
305	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
306	if (status < 0)
307		goto err_out_unlock;
308
309	status = __ipq_enqueue_entry(entry);
310	if (status < 0)
311		goto err_out_unlock;
312
313	write_unlock_bh(&queue_lock);
314	return status;
315
316err_out_unlock:
317	write_unlock_bh(&queue_lock);
318
319err_out_free:
320	kfree(entry);
321	return status;
322}
323
/*
 * Replace the queued packet's payload with data supplied by userspace
 * alongside the verdict.  Shrinks or grows the skb as needed (possibly
 * reallocating it) and triggers re-routing for LOCAL_OUT packets whose
 * routing keys were changed.  Returns 0 on success or negative errno
 * (the caller then drops the packet).
 */
static int
ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
{
	int diff;
	struct iphdr *user_iph = (struct iphdr *)v->payload;

	/* Too short to hold even an IP header: keep the original packet. */
	if (v->data_len < sizeof(*user_iph))
		return 0;
	diff = v->data_len - e->skb->len;
	if (diff < 0)
		skb_trim(e->skb, v->data_len);
	else if (diff > 0) {
		/* IP total length is a 16-bit field; larger is bogus. */
		if (v->data_len > 0xFFFF)
			return -EINVAL;
		if (diff > skb_tailroom(e->skb)) {
			struct sk_buff *newskb;

			newskb = skb_copy_expand(e->skb,
			                         skb_headroom(e->skb),
			                         diff,
			                         GFP_ATOMIC);
			if (newskb == NULL) {
				printk(KERN_WARNING "ip_queue: OOM "
				      "in mangle, dropping packet\n");
				return -ENOMEM;
			}
			/* keep socket write-accounting attached to the copy */
			if (e->skb->sk)
				skb_set_owner_w(newskb, e->skb->sk);
			kfree_skb(e->skb);
			e->skb = newskb;
		}
		skb_put(e->skb, diff);
	}
	memcpy(e->skb->data, v->payload, v->data_len);
	e->skb->nfcache |= NFC_ALTERED;

	/*
	 * Extra routing may be needed on local out, as the QUEUE target never
	 * returns control to the table.
	 */
	if (e->info->hook == NF_IP_LOCAL_OUT) {
		struct iphdr *iph = e->skb->nh.iph;

		if (!(iph->tos == e->rt_info.tos
		      && iph->daddr == e->rt_info.daddr
		      && iph->saddr == e->rt_info.saddr))
			return ip_route_me_harder(&e->skb);
	}
	return 0;
}
374
/* Match a queued entry by packet id (the entry's own address). */
static inline int
id_cmp(struct ipq_queue_entry *e, unsigned long id)
{
	return id == (unsigned long)e;
}
380
381static int
382ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
383{
384	struct ipq_queue_entry *entry;
385
386	if (vmsg->value > NF_MAX_VERDICT)
387		return -EINVAL;
388
389	entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
390	if (entry == NULL)
391		return -ENOENT;
392	else {
393		int verdict = vmsg->value;
394
395		if (vmsg->data_len && vmsg->data_len == len)
396			if (ipq_mangle_ipv4(vmsg, entry) < 0)
397				verdict = NF_DROP;
398
399		ipq_issue_verdict(entry, verdict);
400		return 0;
401	}
402}
403
404static int
405ipq_set_mode(unsigned char mode, unsigned int range)
406{
407	int status;
408
409	write_lock_bh(&queue_lock);
410	status = __ipq_set_mode(mode, range);
411	write_unlock_bh(&queue_lock);
412	return status;
413}
414
415static int
416ipq_receive_peer(struct ipq_peer_msg *pmsg,
417                 unsigned char type, unsigned int len)
418{
419	int status = 0;
420
421	if (len < sizeof(*pmsg))
422		return -EINVAL;
423
424	switch (type) {
425	case IPQM_MODE:
426		status = ipq_set_mode(pmsg->msg.mode.value,
427		                      pmsg->msg.mode.range);
428		break;
429
430	case IPQM_VERDICT:
431		if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
432			status = -EINVAL;
433		else
434			status = ipq_set_verdict(&pmsg->msg.verdict,
435			                         len - sizeof(*pmsg));
436			break;
437	default:
438		status = -EINVAL;
439	}
440	return status;
441}
442
443static int
444dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
445{
446	if (entry->info->indev)
447		if (entry->info->indev->ifindex == ifindex)
448			return 1;
449
450	if (entry->info->outdev)
451		if (entry->info->outdev->ifindex == ifindex)
452			return 1;
453
454	return 0;
455}
456
457static void
458ipq_dev_drop(int ifindex)
459{
460	struct ipq_queue_entry *entry;
461
462	while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
463		ipq_issue_verdict(entry, NF_DROP);
464}
465
466#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
467
468static inline void
469ipq_rcv_skb(struct sk_buff *skb)
470{
471	int status, type, pid, flags, nlmsglen, skblen;
472	struct nlmsghdr *nlh;
473
474	skblen = skb->len;
475	if (skblen < sizeof(*nlh))
476		return;
477
478	nlh = (struct nlmsghdr *)skb->data;
479	nlmsglen = nlh->nlmsg_len;
480	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
481		return;
482
483	pid = nlh->nlmsg_pid;
484	flags = nlh->nlmsg_flags;
485
486	if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
487		RCV_SKB_FAIL(-EINVAL);
488
489	if (flags & MSG_TRUNC)
490		RCV_SKB_FAIL(-ECOMM);
491
492	type = nlh->nlmsg_type;
493	if (type < NLMSG_NOOP || type >= IPQM_MAX)
494		RCV_SKB_FAIL(-EINVAL);
495
496	if (type <= IPQM_BASE)
497		return;
498
499	if(!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
500		RCV_SKB_FAIL(-EPERM);
501
502	write_lock_bh(&queue_lock);
503
504	if (peer_pid) {
505		if (peer_pid != pid) {
506			write_unlock_bh(&queue_lock);
507			RCV_SKB_FAIL(-EBUSY);
508		}
509	}
510	else
511		peer_pid = pid;
512
513	write_unlock_bh(&queue_lock);
514
515	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
516	                          skblen - NLMSG_LENGTH(0));
517	if (status < 0)
518		RCV_SKB_FAIL(status);
519
520	if (flags & NLM_F_ACK)
521		netlink_ack(skb, nlh, 0);
522        return;
523}
524
/*
 * Netlink data-ready callback.  ipqnl_sem serializes processing: if
 * another CPU already holds it, we simply return and let that CPU pick
 * up the new messages — the outer loop re-checks receive_queue after
 * dropping the semaphore, so nothing is left stranded.
 */
static void
ipq_rcv_sk(struct sock *sk, int len)
{
	do {
		struct sk_buff *skb;

		if (down_trylock(&ipqnl_sem))
			return;

		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
			ipq_rcv_skb(skb);
			kfree_skb(skb);
		}

		up(&ipqnl_sem);

	} while (ipqnl && ipqnl->receive_queue.qlen);
}
543
544static int
545ipq_rcv_dev_event(struct notifier_block *this,
546                  unsigned long event, void *ptr)
547{
548	struct net_device *dev = ptr;
549
550	/* Drop any packets associated with the downed device */
551	if (event == NETDEV_DOWN)
552		ipq_dev_drop(dev->ifindex);
553	return NOTIFY_DONE;
554}
555
/* Registered with the netdevice notifier chain at module init.
 * Fields: notifier_call, next, priority. */
static struct notifier_block ipq_dev_notifier = {
	ipq_rcv_dev_event,
	NULL,
	0
};
561
562static int
563ipq_rcv_nl_event(struct notifier_block *this,
564                 unsigned long event, void *ptr)
565{
566	struct netlink_notify *n = ptr;
567
568	if (event == NETLINK_URELEASE &&
569	    n->protocol == NETLINK_FIREWALL && n->pid) {
570		write_lock_bh(&queue_lock);
571		if (n->pid == peer_pid)
572			__ipq_reset();
573		write_unlock_bh(&queue_lock);
574	}
575	return NOTIFY_DONE;
576}
577
/* Registered with the netlink notifier chain at module init.
 * Fields: notifier_call, next, priority. */
static struct notifier_block ipq_nl_notifier = {
	ipq_rcv_nl_event,
	NULL,
	0
};
583
584static int sysctl_maxlen = IPQ_QMAX_DEFAULT;
585static struct ctl_table_header *ipq_sysctl_header;
586
587static ctl_table ipq_table[] = {
588	{ NET_IPQ_QMAX, NET_IPQ_QMAX_NAME, &sysctl_maxlen,
589	  sizeof(sysctl_maxlen), 0644,  NULL, proc_dointvec },
590 	{ 0 }
591};
592
/* /proc/sys/net/ipv4 directory stub holding our single entry. */
static ctl_table ipq_dir_table[] = {
	{NET_IPV4, "ipv4", NULL, 0, 0555, ipq_table, 0, 0, 0, 0, 0},
	{ 0 }
};

/* Root table (/proc/sys/net), registered at module init. */
static ctl_table ipq_root_table[] = {
	{CTL_NET, "net", NULL, 0, 0555, ipq_dir_table, 0, 0, 0, 0, 0},
	{ 0 }
};
602
/*
 * /proc/net/ip_queue read handler: report peer and queue status.
 * Follows the classic get_info contract: skip 'offset' bytes and
 * return at most 'length' bytes via '*start'.
 */
static int
ipq_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len;

	read_lock_bh(&queue_lock);

	len = sprintf(buffer,
	              "Peer PID          : %d\n"
	              "Copy mode         : %hu\n"
	              "Copy range        : %u\n"
	              "Queue length      : %u\n"
	              "Queue max. length : %u\n",
	              peer_pid,
	              copy_mode,
	              copy_range,
	              queue_total,
	              queue_maxlen);

	read_unlock_bh(&queue_lock);

	*start = buffer + offset;
	len -= offset;	/* bytes remaining after the consumer's offset */
	if (len > length)
		len = length;
	else if (len < 0)
		len = 0;
	return len;
}
632
/*
 * Shared setup/teardown.  init != 0 performs module setup; init == 0
 * (module unload) jumps straight to 'cleanup'.  A setup failure falls
 * through the matching subset of labels below, so the labels double as
 * both error unwinding and full module exit, in reverse setup order.
 */
static int
init_or_cleanup(int init)
{
	int status = -ENOMEM;
	struct proc_dir_entry *proc;

	if (!init)
		goto cleanup;

	netlink_register_notifier(&ipq_nl_notifier);
	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
	if (ipqnl == NULL) {
		printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
		goto cleanup_netlink_notifier;
	}

	proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
	if (proc)
		proc->owner = THIS_MODULE;
	else {
		printk(KERN_ERR "ip_queue: failed to create proc entry\n");
		goto cleanup_ipqnl;
	}

	register_netdevice_notifier(&ipq_dev_notifier);
	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);

	/* Last step: start receiving packets from the netfilter hooks. */
	status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
	if (status < 0) {
		printk(KERN_ERR "ip_queue: failed to register queue handler\n");
		goto cleanup_sysctl;
	}
	return status;

cleanup:
	nf_unregister_queue_handler(PF_INET);
	/* Empty write lock/unlock of BR_NETPROTO_LOCK: waits until no CPU
	 * is still inside the softirq packet path before flushing. */
	br_write_lock_bh(BR_NETPROTO_LOCK);
	br_write_unlock_bh(BR_NETPROTO_LOCK);
	ipq_flush(NF_DROP);

cleanup_sysctl:
	unregister_sysctl_table(ipq_sysctl_header);
	unregister_netdevice_notifier(&ipq_dev_notifier);
	proc_net_remove(IPQ_PROC_FS_NAME);

cleanup_ipqnl:
	sock_release(ipqnl->socket);
	/* Drain ipqnl_sem so no CPU is still running ipq_rcv_sk(). */
	down(&ipqnl_sem);
	up(&ipqnl_sem);

cleanup_netlink_notifier:
	netlink_unregister_notifier(&ipq_nl_notifier);
	return status;
}
687
688static int __init init(void)
689{
690
691	return init_or_cleanup(1);
692}
693
694static void __exit fini(void)
695{
696	init_or_cleanup(0);
697}
698
/* Module metadata and entry-point registration. */
MODULE_DESCRIPTION("IPv4 packet queue handler");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
MODULE_LICENSE("GPL");

module_init(init);
module_exit(fini);
705