1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		RAW - implementation of IP "raw" sockets.
7 *
8 * Version:	$Id: raw.c,v 1.1.1.1 2008/10/15 03:27:33 james26_jang Exp $
9 *
10 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 *		Alan Cox	:	verify_area() fixed up
15 *		Alan Cox	:	ICMP error handling
16 *		Alan Cox	:	EMSGSIZE if you send too big a packet
17 *		Alan Cox	: 	Now uses generic datagrams and shared
18 *					skbuff library. No more peek crashes,
19 *					no more backlogs
20 *		Alan Cox	:	Checks sk->broadcast.
21 *		Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
22 *		Alan Cox	:	Raw passes ip options too
23 *		Alan Cox	:	Setsocketopt added
24 *		Alan Cox	:	Fixed error return for broadcasts
25 *		Alan Cox	:	Removed wake_up calls
26 *		Alan Cox	:	Use ttl/tos
27 *		Alan Cox	:	Cleaned up old debugging
28 *		Alan Cox	:	Use new kernel side addresses
29 *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
30 *		Alan Cox	:	BSD style RAW socket demultiplexing.
31 *		Alan Cox	:	Beginnings of mrouted support.
32 *		Alan Cox	:	Added IP_HDRINCL option.
33 *		Alan Cox	:	Skip broadcast check if BSDism set.
34 *		David S. Miller	:	New socket lookup architecture.
35 *
36 *		This program is free software; you can redistribute it and/or
37 *		modify it under the terms of the GNU General Public License
38 *		as published by the Free Software Foundation; either version
39 *		2 of the License, or (at your option) any later version.
40 */
41
42#include <linux/config.h>
43#include <asm/system.h>
44#include <asm/uaccess.h>
45#include <asm/ioctls.h>
46#include <linux/types.h>
47#include <linux/sched.h>
48#include <linux/errno.h>
49#include <linux/timer.h>
50#include <linux/mm.h>
51#include <linux/kernel.h>
52#include <linux/fcntl.h>
53#include <linux/socket.h>
54#include <linux/in.h>
55#include <linux/inet.h>
56#include <linux/netdevice.h>
57#include <linux/mroute.h>
58#include <net/ip.h>
59#include <net/protocol.h>
60#include <linux/skbuff.h>
61#include <net/sock.h>
62#include <net/icmp.h>
63#include <net/udp.h>
64#include <net/raw.h>
65#include <net/inet_common.h>
66#include <net/checksum.h>
67
/*
 * Hash table of raw IPv4 sockets.  raw_v4_hash() picks the bucket with
 * sk->num & (RAWV4_HTABLE_SIZE - 1); each bucket is a list chained
 * through sk->next / sk->pprev.
 */
struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE];
/*
 * Protects raw_v4_htable: taken for writing (BH-disabling variant) by
 * raw_v4_hash()/raw_v4_unhash(), and for reading by raw_v4_input() and
 * raw_get_info().
 */
rwlock_t raw_v4_lock = RW_LOCK_UNLOCKED;
70
71static void raw_v4_hash(struct sock *sk)
72{
73	struct sock **skp = &raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)];
74
75	write_lock_bh(&raw_v4_lock);
76	if ((sk->next = *skp) != NULL)
77		(*skp)->pprev = &sk->next;
78	*skp = sk;
79	sk->pprev = skp;
80	sock_prot_inc_use(sk->prot);
81 	sock_hold(sk);
82	write_unlock_bh(&raw_v4_lock);
83}
84
85static void raw_v4_unhash(struct sock *sk)
86{
87 	write_lock_bh(&raw_v4_lock);
88	if (sk->pprev) {
89		if (sk->next)
90			sk->next->pprev = sk->pprev;
91		*sk->pprev = sk->next;
92		sk->pprev = NULL;
93		sock_prot_dec_use(sk->prot);
94		__sock_put(sk);
95	}
96	write_unlock_bh(&raw_v4_lock);
97}
98
99struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
100			     unsigned long raddr, unsigned long laddr,
101			     int dif)
102{
103	struct sock *s = sk;
104
105	for (s = sk; s; s = s->next) {
106		if (s->num == num 				&&
107		    !(s->daddr && s->daddr != raddr) 		&&
108		    !(s->rcv_saddr && s->rcv_saddr != laddr)	&&
109		    !(s->bound_dev_if && s->bound_dev_if != dif))
110			break; /* gotcha */
111	}
112	return s;
113}
114
115/*
116 *	0 - deliver
117 *	1 - block
118 */
119static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
120{
121	int type;
122
123	type = skb->h.icmph->type;
124	if (type < 32) {
125		__u32 data = sk->tp_pinfo.tp_raw4.filter.data;
126
127		return ((1 << type) & data) != 0;
128	}
129
130	/* Do not block unknown ICMP types */
131	return 0;
132}
133
134/* IP input processing comes here for RAW socket delivery.
135 * This is fun as to avoid copies we want to make no surplus
136 * copies.
137 *
138 * RFC 1122: SHOULD pass TOS value up to the transport layer.
139 * -> It does. And not only TOS, but all IP header.
140 */
141struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
142{
143	struct sock *sk;
144
145	read_lock(&raw_v4_lock);
146	if ((sk = raw_v4_htable[hash]) == NULL)
147		goto out;
148	sk = __raw_v4_lookup(sk, iph->protocol,
149			     iph->saddr, iph->daddr,
150			     skb->dev->ifindex);
151
152	while (sk) {
153		struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol,
154						      iph->saddr, iph->daddr,
155						      skb->dev->ifindex);
156		if (iph->protocol != IPPROTO_ICMP ||
157		    !icmp_filter(sk, skb)) {
158			struct sk_buff *clone;
159
160			if (!sknext)
161				break;
162			clone = skb_clone(skb, GFP_ATOMIC);
163			/* Not releasing hash table! */
164			if (clone)
165				raw_rcv(sk, clone);
166		}
167		sk = sknext;
168	}
169out:
170	if (sk)
171		sock_hold(sk);
172	read_unlock(&raw_v4_lock);
173
174	return sk;
175}
176
177void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
178{
179	int type = skb->h.icmph->type;
180	int code = skb->h.icmph->code;
181	int err = 0;
182	int harderr = 0;
183
184	/* Report error on raw socket, if:
185	   1. User requested ip_recverr.
186	   2. Socket is connected (otherwise the error indication
187	      is useless without ip_recverr and error is hard.
188	 */
189	if (!sk->protinfo.af_inet.recverr && sk->state != TCP_ESTABLISHED)
190		return;
191
192	switch (type) {
193	default:
194	case ICMP_TIME_EXCEEDED:
195		err = EHOSTUNREACH;
196		break;
197	case ICMP_SOURCE_QUENCH:
198		return;
199	case ICMP_PARAMETERPROB:
200		err = EPROTO;
201		harderr = 1;
202		break;
203	case ICMP_DEST_UNREACH:
204		err = EHOSTUNREACH;
205		if (code > NR_ICMP_UNREACH)
206			break;
207		err = icmp_err_convert[code].errno;
208		harderr = icmp_err_convert[code].fatal;
209		if (code == ICMP_FRAG_NEEDED) {
210			harderr = sk->protinfo.af_inet.pmtudisc !=
211					IP_PMTUDISC_DONT;
212			err = EMSGSIZE;
213		}
214	}
215
216	if (sk->protinfo.af_inet.recverr) {
217		struct iphdr *iph = (struct iphdr*)skb->data;
218		u8 *payload = skb->data + (iph->ihl << 2);
219
220		if (sk->protinfo.af_inet.hdrincl)
221			payload = skb->data;
222		ip_icmp_error(sk, skb, err, 0, info, payload);
223	}
224
225	if (sk->protinfo.af_inet.recverr || harderr) {
226		sk->err = err;
227		sk->error_report(sk);
228	}
229}
230
231static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
232{
233	/* Charge it to the socket. */
234
235	if (sock_queue_rcv_skb(sk, skb) < 0) {
236		IP_INC_STATS(IpInDiscards);
237		kfree_skb(skb);
238		return NET_RX_DROP;
239	}
240
241	IP_INC_STATS(IpInDelivers);
242	return NET_RX_SUCCESS;
243}
244
245int raw_rcv(struct sock *sk, struct sk_buff *skb)
246{
247	skb_push(skb, skb->data - skb->nh.raw);
248
249	raw_rcv_skb(sk, skb);
250	return 0;
251}
252
/*
 * Context handed by raw_sendmsg() to the fragment callbacks
 * raw_getfrag() and raw_getrawfrag().
 */
struct rawfakehdr
{
	struct	iovec *iov;		/* user data to copy from */
	u32	saddr;			/* source address, filled into a
					   zero iph->saddr by raw_getrawfrag() */
	struct	dst_entry *dst;		/* route entry passed to
					   ip_select_ident() */
};
259
260/*
261 *	Send a RAW IP packet.
262 */
263
264/*
265 *	Callback support is trivial for SOCK_RAW
266 */
267
268static int raw_getfrag(const void *p, char *to, unsigned int offset,
269			unsigned int fraglen)
270{
271	struct rawfakehdr *rfh = (struct rawfakehdr *) p;
272	return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
273}
274
275/*
276 *	IPPROTO_RAW needs extra work.
277 */
278
279static int raw_getrawfrag(const void *p, char *to, unsigned int offset,
280				unsigned int fraglen)
281{
282	struct rawfakehdr *rfh = (struct rawfakehdr *) p;
283
284	if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
285		return -EFAULT;
286
287	if (!offset) {
288		struct iphdr *iph = (struct iphdr *)to;
289		if (!iph->saddr)
290			iph->saddr = rfh->saddr;
291		iph->check   = 0;
292		iph->tot_len = htons(fraglen); /* This is right as you can't
293						  frag RAW packets */
294		/*
295	 	 *	Deliberate breach of modularity to keep
296	 	 *	ip_build_xmit clean (well less messy).
297		 */
298		if (!iph->id)
299			ip_select_ident(iph, rfh->dst, NULL);
300		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
301	}
302	return 0;
303}
304
/*
 * Send one datagram on a raw socket.
 *
 * sk  - sending socket
 * msg - message header: optional destination (msg_name), data iovec,
 *       control messages and flags
 * len - number of payload bytes to send
 *
 * Returns len on success (including the MSG_CONFIRM|MSG_PROBE case with
 * len == 0) or a negative errno:
 *   -EMSGSIZE    len is negative or larger than 0xFFFF
 *   -EOPNOTSUPP  MSG_OOB was requested
 *   -EINVAL      bad/short address, unconnected socket without an
 *                address, IP options together with hdrincl, or source
 *                routing with a zero destination
 *   -EACCES      broadcast route but sk->broadcast is not set
 *   or whatever ip_cmsg_send(), ip_route_output() or ip_build_xmit()
 *   return.
 */
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
	struct ipcm_cookie ipc;
	struct rawfakehdr rfh;
	struct rtable *rt = NULL;
	int free = 0;		/* set when ipc.opt must be kfree()d here */
	u32 daddr;
	u8  tos;
	int err;

	/* This check is ONLY to check for arithmetic overflow
	   on integer(!) len. Not more! Real check will be made
	   in ip_build_xmit --ANK

	   BTW socket.c -> af_*.c -> ... make multiple
	   invalid conversions size_t -> int. We MUST repair it f.e.
	   by replacing all of them with size_t and revise all
	   the places sort of len += sizeof(struct iphdr)
	   If len was ULONG_MAX-10 it would be catastrophe  --ANK
	 */

	err = -EMSGSIZE;
	if (len < 0 || len > 0xFFFF)
		goto out;

	/*
	 *	Check the flags.
	 */

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)	/* Mirror BSD error message */
		goto out;               /* compatibility */

	/*
	 *	Get and verify the address.
	 */

	if (msg->msg_namelen) {
		struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(*usin))
			goto out;
		if (usin->sin_family != AF_INET) {
			/* Warn once; a zero family is tolerated, any other
			 * non-AF_INET family is rejected.
			 */
			static int complained;
			if (!complained++)
				printk(KERN_INFO "%s forgot to set AF_INET in "
						 "raw sendmsg. Fix it!\n",
						 current->comm);
			err = -EINVAL;
			if (usin->sin_family)
				goto out;
		}
		daddr = usin->sin_addr.s_addr;
		/* ANK: I did not forget to get protocol from port field.
		 * I just do not know, who uses this weirdness.
		 * IP_HDRINCL is much more convenient.
		 */
	} else {
		/* No address given: only valid on a connected socket. */
		err = -EINVAL;
		if (sk->state != TCP_ESTABLISHED)
			goto out;
		daddr = sk->daddr;
	}

	/* Defaults, possibly overridden by control messages below. */
	ipc.addr = sk->saddr;
	ipc.opt = NULL;
	ipc.oif = sk->bound_dev_if;

	if (msg->msg_controllen) {
		err = ip_cmsg_send(msg, &ipc);
		if (err)
			goto out;
		/* Options supplied via cmsg are owned by this call. */
		if (ipc.opt)
			free = 1;
	}

	/* From here on ipc.addr carries the destination, rfh.saddr the
	 * source address.
	 */
	rfh.saddr = ipc.addr;
	ipc.addr = daddr;

	/* Fall back to the socket's own options (not freed here). */
	if (!ipc.opt)
		ipc.opt = sk->protinfo.af_inet.opt;

	if (ipc.opt) {
		err = -EINVAL;
		/* Linux does not mangle headers on raw sockets,
		 * so that IP options + IP_HDRINCL is non-sense.
		 */
		if (sk->protinfo.af_inet.hdrincl)
			goto done;
		if (ipc.opt->srr) {
			if (!daddr)
				goto done;
			/* Source routing: route towards the first hop. */
			daddr = ipc.opt->faddr;
		}
	}
	tos = RT_TOS(sk->protinfo.af_inet.tos) | sk->localroute;
	if (msg->msg_flags & MSG_DONTROUTE)
		tos |= RTO_ONLINK;

	if (MULTICAST(daddr)) {
		/* Use the socket's multicast interface/address when no
		 * explicit one was chosen.
		 */
		if (!ipc.oif)
			ipc.oif = sk->protinfo.af_inet.mc_index;
		if (!rfh.saddr)
			rfh.saddr = sk->protinfo.af_inet.mc_addr;
	}

	err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif);

	if (err)
		goto done;

	err = -EACCES;
	if (rt->rt_flags & RTCF_BROADCAST && !sk->broadcast)
		goto done;

	if (msg->msg_flags & MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	/* Hand the data source and routing result to the fragment
	 * callback: raw_getrawfrag for hdrincl sockets, raw_getfrag
	 * otherwise.
	 */
	rfh.iov		= msg->msg_iov;
	rfh.saddr	= rt->rt_src;
	rfh.dst		= &rt->u.dst;
	if (!ipc.addr)
		ipc.addr = rt->rt_dst;
	err = ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? raw_getrawfrag :
		       	    raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags);

done:
	/* Common cleanup: cmsg-supplied options and the route (rt may
	 * still be NULL when we arrive here before or after a failed
	 * route lookup).
	 */
	if (free)
		kfree(ipc.opt);
	ip_rt_put(rt);

out:	return err < 0 ? err : len;

do_confirm:
	dst_confirm(&rt->u.dst);
	/* MSG_PROBE with no data only confirms the route; anything else
	 * goes on to transmit.
	 */
	if (!(msg->msg_flags & MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto done;
}
446
447static void raw_close(struct sock *sk, long timeout)
448{
449        /*
450	 * Raw sockets may have direct kernel refereneces. Kill them.
451	 */
452	ip_ra_control(sk, 0, NULL);
453
454	inet_sock_release(sk);
455}
456
457/* This gets rid of all the nasties in af_inet. -DaveM */
458static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
459{
460	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
461	int ret = -EINVAL;
462	int chk_addr_ret;
463
464	if (sk->state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
465		goto out;
466	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
467	ret = -EADDRNOTAVAIL;
468	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
469	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
470		goto out;
471	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
472	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
473		sk->saddr = 0;  /* Use device */
474	sk_dst_reset(sk);
475	ret = 0;
476out:	return ret;
477}
478
479/*
480 *	This should be easy, if there is something there
481 *	we return it, otherwise we block.
482 */
483
484int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len,
485		int noblock, int flags, int *addr_len)
486{
487	int copied = 0;
488	int err = -EOPNOTSUPP;
489	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
490	struct sk_buff *skb;
491
492	if (flags & MSG_OOB)
493		goto out;
494
495	if (addr_len)
496		*addr_len = sizeof(*sin);
497
498	if (flags & MSG_ERRQUEUE) {
499		err = ip_recv_error(sk, msg, len);
500		goto out;
501	}
502
503	skb = skb_recv_datagram(sk, flags, noblock, &err);
504	if (!skb)
505		goto out;
506
507	copied = skb->len;
508	if (len < copied) {
509		msg->msg_flags |= MSG_TRUNC;
510		copied = len;
511	}
512
513	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
514	if (err)
515		goto done;
516
517	sock_recv_timestamp(msg, sk, skb);
518
519	/* Copy the address. */
520	if (sin) {
521		sin->sin_family = AF_INET;
522		sin->sin_addr.s_addr = skb->nh.iph->saddr;
523		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
524	}
525	if (sk->protinfo.af_inet.cmsg_flags)
526		ip_cmsg_recv(msg, skb);
527done:
528	skb_free_datagram(sk, skb);
529out:	return err ? : copied;
530}
531
532static int raw_init(struct sock *sk)
533{
534	struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
535	if (sk->num == IPPROTO_ICMP)
536		memset(&tp->filter, 0, sizeof(tp->filter));
537	return 0;
538}
539
540static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen)
541{
542	if (optlen > sizeof(struct icmp_filter))
543		optlen = sizeof(struct icmp_filter);
544	if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen))
545		return -EFAULT;
546	return 0;
547}
548
549static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen)
550{
551	int len, ret = -EFAULT;
552
553	if (get_user(len, optlen))
554		goto out;
555	ret = -EINVAL;
556	if (len < 0)
557		goto out;
558	if (len > sizeof(struct icmp_filter))
559		len = sizeof(struct icmp_filter);
560	ret = -EFAULT;
561	if (put_user(len, optlen) ||
562	    copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len))
563		goto out;
564	ret = 0;
565out:	return ret;
566}
567
568static int raw_setsockopt(struct sock *sk, int level, int optname,
569			  char *optval, int optlen)
570{
571	if (level != SOL_RAW)
572		return ip_setsockopt(sk, level, optname, optval, optlen);
573
574	if (optname == ICMP_FILTER) {
575		if (sk->num != IPPROTO_ICMP)
576			return -EOPNOTSUPP;
577		else
578			return raw_seticmpfilter(sk, optval, optlen);
579	}
580	return -ENOPROTOOPT;
581}
582
583static int raw_getsockopt(struct sock *sk, int level, int optname,
584			  char *optval, int *optlen)
585{
586	if (level != SOL_RAW)
587		return ip_getsockopt(sk, level, optname, optval, optlen);
588
589	if (optname == ICMP_FILTER) {
590		if (sk->num != IPPROTO_ICMP)
591			return -EOPNOTSUPP;
592		else
593			return raw_geticmpfilter(sk, optval, optlen);
594	}
595	return -ENOPROTOOPT;
596}
597
598static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
599{
600	switch (cmd) {
601		case SIOCOUTQ: {
602			int amount = atomic_read(&sk->wmem_alloc);
603			return put_user(amount, (int *)arg);
604		}
605		case SIOCINQ: {
606			struct sk_buff *skb;
607			int amount = 0;
608
609			spin_lock_irq(&sk->receive_queue.lock);
610			skb = skb_peek(&sk->receive_queue);
611			if (skb != NULL)
612				amount = skb->len;
613			spin_unlock_irq(&sk->receive_queue.lock);
614			return put_user(amount, (int *)arg);
615		}
616
617		default:
618#ifdef CONFIG_IP_MROUTE
619			return ipmr_ioctl(sk, cmd, arg);
620#else
621			return -ENOIOCTLCMD;
622#endif
623	}
624}
625
626static void get_raw_sock(struct sock *sp, char *tmpbuf, int i)
627{
628	unsigned int dest = sp->daddr,
629		     src = sp->rcv_saddr;
630	__u16 destp = 0,
631	      srcp  = sp->num;
632
633	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
634		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
635		i, src, srcp, dest, destp, sp->state,
636		atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc),
637		0, 0L, 0,
638		sock_i_uid(sp), 0,
639		sock_i_ino(sp),
640		atomic_read(&sp->refcnt), sp);
641}
642
643int raw_get_info(char *buffer, char **start, off_t offset, int length)
644{
645	int len = 0, num = 0, i;
646	off_t pos = 128;
647	off_t begin;
648	char tmpbuf[129];
649
650	if (offset < 128)
651		len += sprintf(buffer, "%-127s\n",
652			       "  sl  local_address rem_address   st tx_queue "
653			       "rx_queue tr tm->when retrnsmt   uid  timeout "
654			       "inode");
655	read_lock(&raw_v4_lock);
656	for (i = 0; i < RAWV4_HTABLE_SIZE; i++) {
657		struct sock *sk;
658
659		for (sk = raw_v4_htable[i]; sk; sk = sk->next, num++) {
660			if (sk->family != PF_INET)
661				continue;
662			pos += 128;
663			if (pos <= offset)
664				continue;
665			get_raw_sock(sk, tmpbuf, i);
666			len += sprintf(buffer + len, "%-127s\n", tmpbuf);
667			if (len >= length)
668				goto out;
669		}
670	}
671out:
672	read_unlock(&raw_v4_lock);
673	begin = len - (pos - offset);
674	*start = buffer + begin;
675	len -= begin;
676	if (len > length)
677		len = length;
678	if (len < 0)
679		len = 0;
680	return len;
681}
682
683struct proto raw_prot = {
684	name:		"RAW",
685	close:		raw_close,
686	connect:	udp_connect,
687	disconnect:	udp_disconnect,
688	ioctl:		raw_ioctl,
689	init:		raw_init,
690	setsockopt:	raw_setsockopt,
691	getsockopt:	raw_getsockopt,
692	sendmsg:	raw_sendmsg,
693	recvmsg:	raw_recvmsg,
694	bind:		raw_bind,
695	backlog_rcv:	raw_rcv_skb,
696	hash:		raw_v4_hash,
697	unhash:		raw_v4_unhash,
698};
699