flowacct.c revision 4321:a8930ec16e52
1178825Sdfr/*
2233294Sstas * CDDL HEADER START
3233294Sstas *
4233294Sstas * The contents of this file are subject to the terms of the
5178825Sdfr * Common Development and Distribution License (the "License").
6233294Sstas * You may not use this file except in compliance with the License.
7233294Sstas *
8233294Sstas * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9178825Sdfr * or http://www.opensolaris.org/os/licensing.
10233294Sstas * See the License for the specific language governing permissions
11233294Sstas * and limitations under the License.
12178825Sdfr *
13233294Sstas * When distributing Covered Code, include this CDDL HEADER in each
14233294Sstas * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15233294Sstas * If applicable, add the following below this CDDL HEADER, with the
16178825Sdfr * fields enclosed by brackets "[]" replaced with your own identifying
17233294Sstas * information: Portions Copyright [yyyy] [name of copyright owner]
18233294Sstas *
19233294Sstas * CDDL HEADER END
20178825Sdfr */
21233294Sstas
22233294Sstas/*
23233294Sstas * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24233294Sstas * Use is subject to license terms.
25233294Sstas */
26233294Sstas
27233294Sstas#pragma ident	"%Z%%M%	%I%	%E% SMI"
28233294Sstas
29233294Sstas#include <sys/types.h>
30233294Sstas#include <sys/kmem.h>
31233294Sstas#include <sys/conf.h>
32178825Sdfr#include <sys/atomic.h>
33178825Sdfr#include <netinet/in.h>
34233294Sstas#include <netinet/in_systm.h>
35178825Sdfr#include <netinet/ip6.h>
36178825Sdfr#include <sys/socket.h>
37178825Sdfr#include <sys/acct.h>
38178825Sdfr#include <sys/exacct.h>
39233294Sstas#include <inet/common.h>
40178825Sdfr#include <inet/ip.h>
41178825Sdfr#include <inet/ip6.h>
42178825Sdfr#include <sys/ddi.h>
43178825Sdfr#include <sys/strsun.h>
44178825Sdfr#include <ipp/flowacct/flowacct_impl.h>
45178825Sdfr
/*
 * flowacct - IPQoS accounting module. The module maintains an array
 * of 256 hash buckets. When the action routine is invoked for a flow,
 * if the flow (identified by the 5-tuple: saddr, daddr, sport, dport, proto)
 * is already present in the flow table (indexed by the hash function
 * FLOWACCT_FLOW_HASH) then a check is made to see if an item for this flow
 * with the same dsfield, projid & user id is present. If it is, then the
 * number of packets and the bytes are incremented for that item. If the item
 * does not exist, a new item is added for the flow. If the flow is not
 * present, an entry is made for the flow.
 *
 * A timer runs through the table and writes all the flow items that have
 * timed out to the accounting file (via exacct PSARC/1999/119), if present.
 * Configuration commands to change the timing interval are provided. The
 * flow timeout value can also be configured. While the timeout is in nsec,
 * the flow timer interval is in usec.
 * Information for an active flow can be obtained by using kstats.
 */
64178825Sdfr
/*
 * Hash helpers used to pick a flow's bucket.
 *
 * FLOWACCT_ADDR_HASH(addr) XORs bytes 8, 9, 10, 13, 14 and 15 of an
 * address (accessed through its s6_addr8 member) into one value.
 *
 * FLOWACCT_FLOW_HASH(f) takes a pointer to a structure with saddr, daddr,
 * proto, sport and dport members, adds the two address hashes to the
 * protocol and both ports, and reduces the sum modulo FLOW_TBL_COUNT.
 * The argument is parenthesized at every use so that any pointer-valued
 * expression (e.g. &hdr) may be passed, not just a plain identifier.
 */
#define	FLOWACCT_ADDR_HASH(addr)			\
	((addr).s6_addr8[8] ^ (addr).s6_addr8[9] ^	\
	(addr).s6_addr8[10] ^ (addr).s6_addr8[13] ^	\
	(addr).s6_addr8[14] ^ (addr).s6_addr8[15])

#define	FLOWACCT_FLOW_HASH(f)				\
	(((FLOWACCT_ADDR_HASH((f)->saddr)) +		\
	(FLOWACCT_ADDR_HASH((f)->daddr)) +		\
	((f)->proto) + ((f)->sport) + ((f)->dport))	\
	% FLOW_TBL_COUNT)
76178825Sdfr
/*
 * Compute the difference (a - b) between two timespec-style values in
 * nanoseconds and store it in delta, which should be an hrtime_t.
 * Originally taken from ip_mroute.c.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an unbraced if/else), every argument is parenthesized,
 * and the seconds difference is kept in an hrtime_t rather than an int so
 * it cannot be truncated before being scaled by NANOSEC.
 */
#define	FLOWACCT_DELTA(a, b, delta) do {				\
	hrtime_t fa_dsec = (hrtime_t)(a).tv_sec - (hrtime_t)(b).tv_sec;	\
									\
	(delta) = (hrtime_t)(a).tv_nsec - (hrtime_t)(b).tv_nsec;	\
	(delta) += (hrtime_t)NANOSEC * fa_dsec;				\
	/* CONSTCOND */							\
} while (0)
98
/*
 * Module debug level; starts out at 0.
 * NOTE(review): presumably consulted by the flowacct0dbg()/flowacct1dbg()
 * macros used throughout this file - confirm in flowacct_impl.h.
 */
int flowacct_debug = 0;
101
102/* Collect timed out flows to be written to the accounting file */
103typedef struct flow_records_s {
104	flow_usage_t *fl_use;
105	struct flow_records_s *next;
106}flow_records_t;
107
108/* Get port information from the packet. Ignore fragments. */
109static void
110flowacct_port_info(header_t *header, void *iph, int af, mblk_t *mp)
111{
112	uint16_t *up;
113
114	if (af == AF_INET) {
115		ipha_t *ipha = (ipha_t *)iph;
116		uint32_t u2, u1;
117		uint_t iplen;
118
119		u2 = ntohs(ipha->ipha_fragment_offset_and_flags);
120		u1 = u2 & (IPH_MF | IPH_OFFSET);
121		if (u1 != 0) {
122			return;
123		}
124		iplen = (ipha->ipha_version_and_hdr_length & 0xF) << 2;
125		up = (uint16_t *)(mp->b_rptr + iplen);
126		header->sport = (uint16_t)*up++;
127		header->dport = (uint16_t)*up;
128	} else {
129		ip6_t *ip6h = (ip6_t *)iph;
130		uint_t  length = IPV6_HDR_LEN;
131		uint_t  ehdrlen;
132		uint8_t *nexthdrp, *whereptr, *endptr;
133		ip6_dest_t *desthdr;
134		ip6_rthdr_t *rthdr;
135		ip6_hbh_t *hbhhdr;
136
137		whereptr = ((uint8_t *)&ip6h[1]);
138		endptr = mp->b_wptr;
139		nexthdrp = &ip6h->ip6_nxt;
140		while (whereptr < endptr) {
141			switch (*nexthdrp) {
142			case IPPROTO_HOPOPTS:
143				hbhhdr = (ip6_hbh_t *)whereptr;
144				ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
145				if ((uchar_t *)hbhhdr +  ehdrlen > endptr)
146					return;
147				nexthdrp = &hbhhdr->ip6h_nxt;
148				break;
149			case IPPROTO_DSTOPTS:
150				desthdr = (ip6_dest_t *)whereptr;
151				ehdrlen = 8 * (desthdr->ip6d_len + 1);
152				if ((uchar_t *)desthdr +  ehdrlen > endptr)
153					return;
154				nexthdrp = &desthdr->ip6d_nxt;
155				break;
156			case IPPROTO_ROUTING:
157				rthdr = (ip6_rthdr_t *)whereptr;
158				ehdrlen =  8 * (rthdr->ip6r_len + 1);
159				if ((uchar_t *)rthdr +  ehdrlen > endptr)
160					return;
161				nexthdrp = &rthdr->ip6r_nxt;
162				break;
163			case IPPROTO_FRAGMENT:
164				return;
165			case IPPROTO_TCP:
166			case IPPROTO_UDP:
167			case IPPROTO_SCTP:
168				/*
169				 * Verify we have at least ICMP_MIN_TP_HDR_LEN
170				 * bytes of the ULP's header to get the port
171				 * info.
172				 */
173				if (((uchar_t *)ip6h + length +
174				    ICMP_MIN_TP_HDR_LEN)  > endptr) {
175					return;
176				}
177				/* Get the protocol & ports */
178				header->proto = *nexthdrp;
179				up = (uint16_t *)((uchar_t *)ip6h + length);
180				header->sport = (uint16_t)*up++;
181				header->dport = (uint16_t)*up;
182				return;
183			case IPPROTO_ICMPV6:
184			case IPPROTO_ENCAP:
185			case IPPROTO_IPV6:
186			case IPPROTO_ESP:
187			case IPPROTO_AH:
188				header->proto = *nexthdrp;
189				return;
190			case IPPROTO_NONE:
191			default:
192				return;
193			}
194			length += ehdrlen;
195			whereptr += ehdrlen;
196		}
197	}
198}
199
200/*
201 * flowacct_find_ids(mp, header)
202 *
203 * attempt to discern the uid and projid of the originator of a packet by
204 * looking at the dblks making up the packet - yeuch!
205 *
206 * We do it by skipping any fragments with a credp of NULL (originated in
207 * kernel), taking the first value that isn't NULL to be the cred_t for the
208 * whole packet.
209 */
210static void
211flowacct_find_ids(mblk_t *mp, header_t *header)
212{
213	cred_t *cr;
214
215	while (DB_CRED(mp) == NULL && mp->b_cont != NULL)
216		mp = mp->b_cont;
217
218	if ((cr = DB_CRED(mp)) != NULL) {
219		header->uid = crgetuid(cr);
220		header->projid = crgetprojid(cr);
221	} else {
222		header->uid = (uid_t)-1;
223		header->projid = -1;
224	}
225}
226
227/*
228 * Extract header information in a header_t structure so that we don't have
229 * have to parse the packet everytime.
230 */
231static int
232flowacct_extract_header(mblk_t *mp, header_t *header)
233{
234	ipha_t *ipha;
235	ip6_t *ip6h;
236#define	rptr	((uchar_t *)ipha)
237
238	/* 0 means no port extracted. */
239	header->sport = 0;
240	header->dport = 0;
241	flowacct_find_ids(mp, header);
242
243	V6_SET_ZERO(header->saddr);
244	V6_SET_ZERO(header->daddr);
245
246	ipha = (ipha_t *)mp->b_rptr;
247	header->isv4 = IPH_HDR_VERSION(ipha) == IPV4_VERSION;
248	if (header->isv4) {
249		ipha = (ipha_t *)mp->b_rptr;
250		V4_PART_OF_V6(header->saddr) = (int32_t)ipha->ipha_src;
251		V4_PART_OF_V6(header->daddr) = (int32_t)ipha->ipha_dst;
252		header->dsfield = ipha->ipha_type_of_service;
253		header->proto = ipha->ipha_protocol;
254		header->pktlen = ntohs(ipha->ipha_length);
255		if ((header->proto == IPPROTO_TCP) ||
256		    (header->proto == IPPROTO_UDP) ||
257		    (header->proto == IPPROTO_SCTP)) {
258			flowacct_port_info(header, ipha, AF_INET, mp);
259		}
260	} else {
261		/*
262		 * Need to pullup everything.
263		 */
264		if (mp->b_cont != NULL) {
265			if (!pullupmsg(mp, -1)) {
266				flowacct0dbg(("flowacct_extract_header: "\
267				    "pullup error"));
268				return (-1);
269			}
270		}
271		ip6h = (ip6_t *)mp->b_rptr;
272		bcopy(ip6h->ip6_src.s6_addr32, header->saddr.s6_addr32,
273		    sizeof (ip6h->ip6_src.s6_addr32));
274		bcopy(ip6h->ip6_dst.s6_addr32, header->daddr.s6_addr32,
275		    sizeof (ip6h->ip6_dst.s6_addr32));
276		header->dsfield = __IPV6_TCLASS_FROM_FLOW(ip6h->ip6_vcf);
277		header->proto = ip6h->ip6_nxt;
278		header->pktlen = ntohs(ip6h->ip6_plen) +
279		    ip_hdr_length_v6(mp, ip6h);
280		flowacct_port_info(header, ip6h, AF_INET6, mp);
281
282	}
283#undef	rptr
284	return (0);
285}
286
287/* Check if the flow (identified by the 5-tuple) exists in the hash table */
288static flow_t *
289flowacct_flow_present(header_t *header, int index,
290    flowacct_data_t *flowacct_data)
291{
292	list_hdr_t *hdr = flowacct_data->flows_tbl[index].head;
293	flow_t *flow;
294
295	while (hdr != NULL) {
296		flow = (flow_t *)hdr->objp;
297		if ((flow != NULL) &&
298		    (IN6_ARE_ADDR_EQUAL(&flow->saddr, &header->saddr)) &&
299		    (IN6_ARE_ADDR_EQUAL(&flow->daddr, &header->daddr)) &&
300		    (flow->proto == header->proto) &&
301		    (flow->sport == header->sport) &&
302		    (flow->dport == header->dport)) {
303			return (flow);
304		}
305		hdr = hdr->next;
306	}
307	return ((flow_t *)NULL);
308}
309
310/*
311 * Add an object to the list at insert_point. This could be a flow item or
312 * a flow itself.
313 */
314static list_hdr_t *
315flowacct_add_obj(list_head_t *tophdr, list_hdr_t *insert_point, void *obj)
316{
317	list_hdr_t *new_hdr;
318
319	if (tophdr == NULL) {
320		return ((list_hdr_t *)NULL);
321	}
322
323	new_hdr = (list_hdr_t *)kmem_zalloc(FLOWACCT_HDR_SZ, KM_NOSLEEP);
324	if (new_hdr == NULL) {
325		flowacct0dbg(("flowacct_add_obj: error allocating mem"));
326		return ((list_hdr_t *)NULL);
327	}
328	gethrestime(&new_hdr->last_seen);
329	new_hdr->objp = obj;
330	tophdr->nbr_items++;
331
332	if (insert_point == NULL) {
333		if (tophdr->head == NULL) {
334			tophdr->head = new_hdr;
335			tophdr->tail = new_hdr;
336			return (new_hdr);
337		}
338
339		new_hdr->next = tophdr->head;
340		tophdr->head->prev = new_hdr;
341		tophdr->head = new_hdr;
342		return (new_hdr);
343	}
344
345	if (insert_point == tophdr->tail) {
346		tophdr->tail->next = new_hdr;
347		new_hdr->prev = tophdr->tail;
348		tophdr->tail = new_hdr;
349		return (new_hdr);
350	}
351
352	new_hdr->next = insert_point->next;
353	new_hdr->prev = insert_point;
354	insert_point->next->prev = new_hdr;
355	insert_point->next = new_hdr;
356	return (new_hdr);
357}
358
359/* Delete an obj from the list. This could be a flow item or the flow itself */
360static void
361flowacct_del_obj(list_head_t *tophdr, list_hdr_t *hdr, uint_t mode)
362{
363	size_t	length;
364	uint_t	type;
365
366	if ((tophdr == NULL) || (hdr == NULL)) {
367		return;
368	}
369
370	type = ((flow_t *)hdr->objp)->type;
371
372	tophdr->nbr_items--;
373
374	if (hdr->next != NULL) {
375		hdr->next->prev = hdr->prev;
376	}
377	if (hdr->prev != NULL) {
378		hdr->prev->next = hdr->next;
379	}
380	if (tophdr->head == hdr) {
381		tophdr->head = hdr->next;
382	}
383	if (tophdr->tail == hdr) {
384		tophdr->tail = hdr->prev;
385	}
386
387	if (mode == FLOWACCT_DEL_OBJ) {
388		switch (type) {
389		case FLOWACCT_FLOW:
390			length = FLOWACCT_FLOW_SZ;
391			break;
392		case FLOWACCT_ITEM:
393			length = FLOWACCT_ITEM_SZ;
394			break;
395		}
396		kmem_free(hdr->objp, length);
397		hdr->objp = NULL;
398	}
399
400	kmem_free((void *)hdr, FLOWACCT_HDR_SZ);
401}
402
403/*
404 * Checks if the given item (identified by dsfield, project id and uid)
405 * is already present for the flow.
406 */
407static flow_item_t *
408flowacct_item_present(flow_t *flow, uint8_t dsfield, pid_t proj_id, uint_t uid)
409{
410	list_hdr_t	*itemhdr;
411	flow_item_t	*item;
412
413	itemhdr = flow->items.head;
414
415	while (itemhdr != NULL) {
416		item = (flow_item_t *)itemhdr->objp;
417
418		if ((item->dsfield != dsfield) || (item->projid != proj_id) ||
419		    (item->uid != uid)) {
420			itemhdr = itemhdr->next;
421			continue;
422		}
423		return (item);
424	}
425
426	return ((flow_item_t *)NULL);
427}
428
429/*
430 * Add the flow to the table, if not already present. If the flow is
431 * present in the table, add the item. Also, update the flow stats.
432 * Additionally, re-adjust the timout list as well.
433 */
434static int
435flowacct_update_flows_tbl(header_t *header, flowacct_data_t *flowacct_data)
436{
437	int index;
438	list_head_t *fhead;
439	list_head_t *thead;
440	list_head_t *ihead;
441	boolean_t added_flow = B_FALSE;
442	timespec_t  now;
443	flow_item_t *item;
444	flow_t *flow;
445
446	index = FLOWACCT_FLOW_HASH(header);
447	fhead = &flowacct_data->flows_tbl[index];
448
449	/* The timeout list */
450	thead = &flowacct_data->flows_tbl[FLOW_TBL_COUNT];
451
452	mutex_enter(&fhead->lock);
453	flow = flowacct_flow_present(header, index, flowacct_data);
454	if (flow == NULL) {
455		flow = (flow_t *)kmem_zalloc(FLOWACCT_FLOW_SZ, KM_NOSLEEP);
456		if (flow == NULL) {
457			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
458			    "error"));
459			mutex_exit(&fhead->lock);
460			return (-1);
461		}
462		flow->hdr = flowacct_add_obj(fhead, fhead->tail, (void *)flow);
463		if (flow->hdr == NULL) {
464			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
465			    "error"));
466			kmem_free(flow, FLOWACCT_FLOW_SZ);
467			mutex_exit(&fhead->lock);
468			return (-1);
469		}
470
471		flow->type = FLOWACCT_FLOW;
472		flow->isv4 = header->isv4;
473		bcopy(header->saddr.s6_addr32, flow->saddr.s6_addr32,
474		    sizeof (header->saddr.s6_addr32));
475		bcopy(header->daddr.s6_addr32, flow->daddr.s6_addr32,
476		    sizeof (header->daddr.s6_addr32));
477		flow->proto = header->proto;
478		flow->sport = header->sport;
479		flow->dport = header->dport;
480		flow->back_ptr = fhead;
481		added_flow = B_TRUE;
482	} else {
483		/*
484		 * We need to make sure that this 'flow' is not deleted
485		 * either by a scheduled timeout or an explict call
486		 * to flowacct_timer() below.
487		 */
488		flow->inuse = B_TRUE;
489	}
490
491	ihead = &flow->items;
492	item = flowacct_item_present(flow, header->dsfield, header->projid,
493	    header->uid);
494	if (item == NULL) {
495		boolean_t just_once = B_TRUE;
496		/*
497		 * For all practical purposes, we limit the no. of entries in
498		 * the flow table - i.e. the max_limt that a user specifies is
499		 * the maximum no. of flow items in the table.
500		 */
501	try_again:
502		atomic_add_32(&flowacct_data->nflows, 1);
503		if (flowacct_data->nflows > flowacct_data->max_limit) {
504			atomic_add_32(&flowacct_data->nflows, -1);
505
506			/* Try timing out once */
507			if (just_once) {
508				/*
509				 * Need to release the lock, as this entry
510				 * could contain a flow that can be timed
511				 * out.
512				 */
513				mutex_exit(&fhead->lock);
514				flowacct_timer(FLOWACCT_JUST_ONE,
515				    flowacct_data);
516				mutex_enter(&fhead->lock);
517				/* Lets check again */
518				just_once = B_FALSE;
519				goto try_again;
520			} else {
521				mutex_exit(&fhead->lock);
522				flowacct1dbg(("flowacct_update_flows_tbl: "\
523				    "maximum active flows exceeded\n"));
524				if (added_flow) {
525					flowacct_del_obj(fhead, flow->hdr,
526					    FLOWACCT_DEL_OBJ);
527				}
528				return (-1);
529			}
530		}
531		item = (flow_item_t *)kmem_zalloc(FLOWACCT_ITEM_SZ, KM_NOSLEEP);
532		if (item == NULL) {
533			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
534			    "error"));
535			/* Need to remove the flow, if one was added */
536			if (added_flow) {
537				flowacct_del_obj(fhead, flow->hdr,
538				    FLOWACCT_DEL_OBJ);
539			}
540			atomic_add_32(&flowacct_data->nflows, -1);
541			mutex_exit(&fhead->lock);
542			return (-1);
543		}
544		item->hdr = flowacct_add_obj(ihead, ihead->tail, (void *)item);
545		if (item->hdr == NULL) {
546			flowacct0dbg(("flowacct_update_flows_tbl: mem alloc "\
547			    "error\n"));
548			kmem_free(item, FLOWACCT_ITEM_SZ);
549			/* Need to remove the flow, if one was added */
550			if (added_flow) {
551				flowacct_del_obj(fhead, flow->hdr,
552				    FLOWACCT_DEL_OBJ);
553			}
554			atomic_add_32(&flowacct_data->nflows, -1);
555			mutex_exit(&fhead->lock);
556			return (-1);
557		}
558		/* If a flow was added, add it too */
559		if (added_flow) {
560			atomic_add_64(&flowacct_data->usedmem,
561			    FLOWACCT_FLOW_RECORD_SZ);
562		}
563		atomic_add_64(&flowacct_data->usedmem, FLOWACCT_ITEM_RECORD_SZ);
564
565		item->type = FLOWACCT_ITEM;
566		item->dsfield = header->dsfield;
567		item->projid = header->projid;
568		item->uid = header->uid;
569		item->npackets = 1;
570		item->nbytes = header->pktlen;
571		item->creation_time = item->hdr->last_seen;
572	} else {
573		item->npackets++;
574		item->nbytes += header->pktlen;
575	}
576	gethrestime(&now);
577	flow->hdr->last_seen = item->hdr->last_seen = now;
578	mutex_exit(&fhead->lock);
579
580	/*
581	 * Re-adjust the timeout list. The timer takes the thead lock
582	 * follwed by fhead lock(s), so we release fhead, take thead
583	 * and re-take fhead.
584	 */
585	mutex_enter(&thead->lock);
586	mutex_enter(&fhead->lock);
587	/* If the flow was added, append it to the tail of the timeout list */
588	if (added_flow) {
589		if (thead->head == NULL) {
590			thead->head = flow->hdr;
591			thead->tail = flow->hdr;
592		} else {
593			thead->tail->timeout_next = flow->hdr;
594			flow->hdr->timeout_prev = thead->tail;
595			thead->tail = flow->hdr;
596		}
597	/*
598	 * Else, move this flow to the tail of the timeout list, if it is not
599	 * already.
600	 */
601	} else if (flow->hdr != thead->tail) {
602		if (flow->hdr == thead->head) {
603			thead->head->timeout_next->timeout_prev = NULL;
604			thead->head = thead->head->timeout_next;
605			flow->hdr->timeout_next = NULL;
606			thead->tail->timeout_next = flow->hdr;
607			flow->hdr->timeout_prev = thead->tail;
608			thead->tail = flow->hdr;
609		} else {
610			flow->hdr->timeout_prev->timeout_next =
611			    flow->hdr->timeout_next;
612			flow->hdr->timeout_next->timeout_prev =
613			    flow->hdr->timeout_prev;
614			flow->hdr->timeout_next = NULL;
615			thead->tail->timeout_next = flow->hdr;
616			flow->hdr->timeout_prev = thead->tail;
617			thead->tail = flow->hdr;
618		}
619		/*
620		 * Unset this variable, now it is fine even if this
621		 * flow gets deleted (i.e. after timing out its
622		 * flow items) since we are done using it.
623		 */
624		flow->inuse = B_FALSE;
625	}
626	mutex_exit(&fhead->lock);
627	mutex_exit(&thead->lock);
628	atomic_add_64(&flowacct_data->tbytes, header->pktlen);
629	return (0);
630}
631
632/* Timer for timing out flows/items from the flow table */
633void
634flowacct_timeout_flows(void *args)
635{
636	flowacct_data_t *flowacct_data = (flowacct_data_t *)args;
637	flowacct_timer(FLOWACCT_FLOW_TIMER, flowacct_data);
638	flowacct_data->flow_tid = timeout(flowacct_timeout_flows, flowacct_data,
639	    drv_usectohz(flowacct_data->timer));
640}
641
642
643/* Delete the item from the flow in the flow table */
644static void
645flowacct_timeout_item(flow_t **flow, list_hdr_t **item_hdr)
646{
647	list_hdr_t *next_it_hdr;
648
649	next_it_hdr = (*item_hdr)->next;
650	flowacct_del_obj(&(*flow)->items, *item_hdr, FLOWACCT_DEL_OBJ);
651	*item_hdr = next_it_hdr;
652}
653
654/* Create a flow record for this timed out item */
655static flow_records_t *
656flowacct_create_record(flow_t *flow, list_hdr_t *ithdr)
657{
658	int count;
659	flow_item_t *item = (flow_item_t *)ithdr->objp;
660	flow_records_t *tmp_frec = NULL;
661
662	/* Record to be written into the accounting file */
663	tmp_frec = kmem_zalloc(sizeof (flow_records_t), KM_NOSLEEP);
664	if (tmp_frec == NULL) {
665		flowacct0dbg(("flowacct_create_record: mem alloc error.\n"));
666		return (NULL);
667	}
668	tmp_frec->fl_use = kmem_zalloc(sizeof (flow_usage_t), KM_NOSLEEP);
669	if (tmp_frec->fl_use == NULL) {
670		flowacct0dbg(("flowacct_create_record: mem alloc error\n"));
671		kmem_free(tmp_frec, sizeof (flow_records_t));
672		return (NULL);
673	}
674
675	/* Copy the IP address */
676	for (count = 0; count < 4; count++) {
677		tmp_frec->fl_use->fu_saddr[count] =
678		    htonl(flow->saddr.s6_addr32[count]);
679		tmp_frec->fl_use->fu_daddr[count] =
680		    htonl(flow->daddr.s6_addr32[count]);
681	}
682
683	/*
684	 * Ports, protocol, version, dsfield, project id, uid, nbytes, npackets
685	 * creation time and last seen.
686	 */
687	tmp_frec->fl_use->fu_sport = htons(flow->sport);
688	tmp_frec->fl_use->fu_dport = htons(flow->dport);
689	tmp_frec->fl_use->fu_protocol = flow->proto;
690	tmp_frec->fl_use->fu_isv4 = flow->isv4;
691	tmp_frec->fl_use->fu_dsfield = item->dsfield;
692	tmp_frec->fl_use->fu_projid = item->projid;
693	tmp_frec->fl_use->fu_userid = item->uid;
694	tmp_frec->fl_use->fu_nbytes = item->nbytes;
695	tmp_frec->fl_use->fu_npackets = item->npackets;
696	tmp_frec->fl_use->fu_lseen =
697	    (uint64_t)(ulong_t)ithdr->last_seen.tv_sec;
698	tmp_frec->fl_use->fu_ctime =
699	    (uint64_t)(ulong_t)item->creation_time.tv_sec;
700
701	return (tmp_frec);
702}
703
704/*
705 * Scan thru the timeout list and write the records to the accounting file, if
706 * possible. Basically step thru the timeout list maintained in the last
707 * hash bucket, FLOW_COUNT_TBL + 1, and timeout flows. This could be called
708 * from the timer, FLOWACCT_TIMER - delete only timed out flows or when this
709 * instance is deleted, FLOWACCT_PURGE_FLOW - delete all the flows from the
710 * table or as FLOWACCT_JUST_ONE - delete the first timed out flow. Since the
711 * flows are cronologically arranged in the timeout list,  when called as
712 * FLOWACCT_TIMER and FLOWACCT_JUST_ONE, we can stop when we come across
713 * the first flow that has not timed out (which means none of the following
714 * flows would have timed out).
715 */
716void
717flowacct_timer(int type, flowacct_data_t *flowacct_data)
718{
719	hrtime_t diff;
720	timespec_t now;
721	list_head_t *head, *thead;
722	flow_t *flow;
723	flow_item_t *item;
724	list_hdr_t *fl_hdr, *next_fl_hdr;
725	list_hdr_t *ithdr = (list_hdr_t *)NULL;
726	flow_records_t *frec = NULL, *tmp_frec, *tail;
727	uint64_t flow_size;
728	uint64_t item_size;
729
730	ASSERT(flowacct_data != NULL);
731
732	/* 2s-complement for subtraction */
733	flow_size = ~FLOWACCT_FLOW_RECORD_SZ + 1;
734	item_size = ~FLOWACCT_ITEM_RECORD_SZ + 1;
735
736	/* Get the current time */
737	gethrestime(&now);
738
739	/*
740	 * For each flow in the table, scan thru all the items and delete
741	 * those that have exceeded the timeout. If all the items in a
742	 * flow have timed out, delete the flow entry as well. Finally,
743	 * write all the delted items to the accounting file.
744	 */
745	thead = &flowacct_data->flows_tbl[FLOW_TBL_COUNT];
746
747	mutex_enter(&thead->lock);
748	fl_hdr = thead->head;
749	while (fl_hdr != NULL) {
750		uint32_t	items_deleted = 0;
751
752		next_fl_hdr = fl_hdr->timeout_next;
753		flow = (flow_t *)fl_hdr->objp;
754		head = flow->back_ptr;
755		mutex_enter(&head->lock);
756
757		/*LINTED*/
758		FLOWACCT_DELTA(now, fl_hdr->last_seen, diff);
759
760		/*
761		 * If type is FLOW_TIMER, then check if the item has timed out.
762		 * If type is FLOW_PURGE delete the entry anyways.
763		 */
764		if ((type != FLOWACCT_PURGE_FLOW) &&
765		    (diff < flowacct_data->timeout)) {
766			mutex_exit(&head->lock);
767			mutex_exit(&thead->lock);
768			goto write_records;
769		}
770
771		ithdr = flow->items.head;
772		while (ithdr != NULL) {
773			item = (flow_item_t *)ithdr->objp;
774			/*
775			 * Fill in the flow record to be
776			 * written to the accounting file.
777			 */
778			tmp_frec = flowacct_create_record(flow, ithdr);
779			/*
780			 * If we don't have memory for records,
781			 * we will come back in case this is
782			 * called as FLOW_TIMER, else we will
783			 * go ahead and delete the item from
784			 * the table (when asked to PURGE the
785			 * table), so there could be some
786			 * entries not written to the file
787			 * when this action instance is
788			 * deleted.
789			 */
790			if (tmp_frec != NULL) {
791				tmp_frec->fl_use->fu_aname =
792				    flowacct_data->act_name;
793				if (frec == NULL) {
794					frec = tmp_frec;
795					tail = frec;
796				} else {
797					tail->next = tmp_frec;
798					tail = tmp_frec;
799				}
800			} else if (type != FLOWACCT_PURGE_FLOW) {
801				mutex_exit(&head->lock);
802				mutex_exit(&thead->lock);
803				atomic_add_32(&flowacct_data->nflows,
804				    (~items_deleted + 1));
805				goto write_records;
806			}
807
808			/* Update stats */
809			atomic_add_64(&flowacct_data->tbytes, (~item->nbytes +
810			    1));
811
812			/* Delete the item */
813			flowacct_timeout_item(&flow, &ithdr);
814			items_deleted++;
815			atomic_add_64(&flowacct_data->usedmem, item_size);
816		}
817		ASSERT(flow->items.nbr_items == 0);
818		atomic_add_32(&flowacct_data->nflows, (~items_deleted + 1));
819
820		/*
821		 * Don't delete this flow if we are making place for
822		 * a new item for this flow.
823		 */
824		if (!flow->inuse) {
825			if (fl_hdr == thead->tail) {
826				thead->head = thead->tail = NULL;
827			} else {
828				thead->head = fl_hdr->timeout_next;
829				thead->head->timeout_prev = NULL;
830			}
831			flowacct_del_obj(head, fl_hdr, FLOWACCT_DEL_OBJ);
832			atomic_add_64(&flowacct_data->usedmem, flow_size);
833		}
834		mutex_exit(&head->lock);
835		if (type == FLOWACCT_JUST_ONE) {
836			mutex_exit(&thead->lock);
837			goto write_records;
838		}
839		fl_hdr = next_fl_hdr;
840	}
841	mutex_exit(&thead->lock);
842write_records:
843	/* Write all the timed out flows to the accounting file */
844	while (frec != NULL) {
845		tmp_frec = frec->next;
846		exacct_commit_flow(frec->fl_use);
847		kmem_free(frec->fl_use, sizeof (flow_usage_t));
848		kmem_free(frec, sizeof (flow_records_t));
849		frec = tmp_frec;
850	}
851}
852
853/*
854 * Get the IP header contents from the packet, update the flow table with
855 * this item and return.
856 */
857int
858flowacct_process(mblk_t **mpp, flowacct_data_t *flowacct_data)
859{
860	header_t *header;
861	mblk_t *mp = *mpp;
862
863	ASSERT(mp != NULL);
864
865	/* If we don't find an M_DATA, return error */
866	if (mp->b_datap->db_type != M_DATA) {
867		if ((mp->b_cont != NULL) &&
868		    (mp->b_cont->b_datap->db_type == M_DATA)) {
869			mp = mp->b_cont;
870		} else {
871			flowacct0dbg(("flowacct_process: no data\n"));
872			atomic_add_64(&flowacct_data->epackets, 1);
873			return (EINVAL);
874		}
875	}
876
877	header = kmem_zalloc(FLOWACCT_HEADER_SZ, KM_NOSLEEP);
878	if (header == NULL) {
879		flowacct0dbg(("flowacct_process: error allocing mem"));
880		atomic_add_64(&flowacct_data->epackets, 1);
881		return (ENOMEM);
882	}
883
884	/* Get all the required information into header. */
885	if (flowacct_extract_header(mp, header) != 0) {
886		kmem_free(header, FLOWACCT_HEADER_SZ);
887		atomic_add_64(&flowacct_data->epackets, 1);
888		return (EINVAL);
889	}
890
891	/* Updated the flow table with this entry */
892	if (flowacct_update_flows_tbl(header, flowacct_data) != 0) {
893		kmem_free(header, FLOWACCT_HEADER_SZ);
894		atomic_add_64(&flowacct_data->epackets, 1);
895		return (ENOMEM);
896	}
897
898	/* Update global stats */
899	atomic_add_64(&flowacct_data->npackets, 1);
900	atomic_add_64(&flowacct_data->nbytes, header->pktlen);
901
902	kmem_free(header, FLOWACCT_HEADER_SZ);
903	if (flowacct_data->flow_tid == 0) {
904		flowacct_data->flow_tid = timeout(flowacct_timeout_flows,
905		    flowacct_data, drv_usectohz(flowacct_data->timer));
906	}
907	return (0);
908}
909