unbound_munin_ revision 356345
1#!/bin/sh
2#
3# plugin for munin to monitor usage of unbound servers.
4# To install copy this to /usr/local/share/munin/plugins/unbound_munin_
5# and use munin-node-configure (--suggest, --shell).
6#
7# (C) 2008 W.C.A. Wijngaards.  BSD Licensed.
8#
# To install: enable statistics and unbound-control in unbound.conf
10#	server:		extended-statistics: yes
11#			statistics-cumulative: no
12#			statistics-interval: 0
13#	remote-control:	control-enable: yes
14# Run the command unbound-control-setup to generate the key files.
15#
16# Environment variables for this script
17#	statefile	- where to put temporary statefile.
18#	unbound_conf	- where the unbound.conf file is located.
19#	unbound_control	- where to find unbound-control executable.
20#	spoof_warn	- what level to warn about spoofing
21#	spoof_crit	- what level to crit about spoofing
22#
23# You can set them in your munin/plugin-conf.d/plugins.conf file
24# with:
25# [unbound*]
26# user root
27# env.statefile /usr/local/var/munin/plugin-state/unbound-state
28# env.unbound_conf /usr/local/etc/unbound/unbound.conf
29# env.unbound_control /usr/local/sbin/unbound-control
30# env.spoof_warn 1000
31# env.spoof_crit 100000
32#
33# This plugin can create different graphs depending on what name
34# you link it as (with ln -s) into the plugins directory
35# You can link it multiple times.
36# If you are only a casual user, the _hits and _by_type are most interesting,
37# possibly followed by _by_rcode.
38#
39#	unbound_munin_hits	- base volume, cache hits, unwanted traffic
40#	unbound_munin_queue	- to monitor the internal requestlist
41#	unbound_munin_memory	- memory usage
42#	unbound_munin_by_type	- incoming queries by type
43#	unbound_munin_by_class	- incoming queries by class
44#	unbound_munin_by_opcode	- incoming queries by opcode
45#	unbound_munin_by_rcode	- answers by rcode, validation status
46#	unbound_munin_by_flags	- incoming queries by flags
47#	unbound_munin_histogram	- histogram of query resolving times
48#
49# Magic markers - optional - used by installation scripts and
50# munin-config:  (originally contrib family but munin-node-configure ignores it)
51#
52#%# family=auto
53#%# capabilities=autoconf suggest
54
55# POD documentation
56: <<=cut
57=head1 NAME
58
59unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.
60
61=head1 APPLICABLE SYSTEMS
62
63System with unbound daemon.
64
65=head1 CONFIGURATION
66
67  [unbound*]
68  user root
69  env.statefile /usr/local/var/munin/plugin-state/unbound-state
70  env.unbound_conf /usr/local/etc/unbound/unbound.conf
71  env.unbound_control /usr/local/sbin/unbound-control
72  env.spoof_warn 1000
73  env.spoof_crit 100000
74
75Use the .env settings to override the defaults.
76
77=head1 USAGE
78
Can be used to present different graphs. Use ln -s for that name in
the plugins directory to enable the graph.

  unbound_munin_hits	- base volume, cache hits, unwanted traffic
  unbound_munin_queue	- to monitor the internal requestlist
  unbound_munin_memory	- memory usage
  unbound_munin_by_type	- incoming queries by type
  unbound_munin_by_class	- incoming queries by class
  unbound_munin_by_opcode	- incoming queries by opcode
  unbound_munin_by_rcode	- answers by rcode, validation status
  unbound_munin_by_flags	- incoming queries by flags
  unbound_munin_histogram	- histogram of query resolving times

90
91=head1 AUTHOR
92
93Copyright 2008 W.C.A. Wijngaards
94
95=head1 LICENSE
96
97BSD
98
99=cut
100
# Settings.  Each one takes its value from the environment variable of
# the same purpose (see the header of this file) and falls back to the
# default given here.
state="${statefile:-/usr/local/var/munin/plugin-state/unbound-state}"
conf="${unbound_conf:-/usr/local/etc/unbound/unbound.conf}"
ctrl="${unbound_control:-/usr/local/sbin/unbound-control}"
warn="${spoof_warn:-1000}"
crit="${spoof_crit:-100000}"
# lock file guarding the statefile; lives next to it
lock="${state}.lock"

# Number of seconds between polling attempts.  The statefile is kept
# around for at least this long, so that the multiple links of this
# script can share one set of results.
lee=55

# sed arguments that shorten statistic names, to keep things within
# 19 characters.  Always expanded unquoted so that each "-e script"
# pair becomes separate arguments.
ABBREV="-e s/total/t/ -e s/thread/t/ -e s/num/n/ -e s/query/q/ \
-e s/answer/a/ -e s/unwanted/u/ -e s/requestlist/ql/ -e s/type/t/ \
-e s/class/c/ -e s/opcode/o/ -e s/rcode/r/ -e s/edns/e/ -e s/mem/m/ \
-e s/cache/c/ -e s/mod/m/"
115
# Look up statistic $1 in the statefile ($state) and store its value in
# the global return variable $value.  $value becomes "0" when the key is
# absent or has an empty value.
#
# The key is compared as a literal string, not as a regular expression,
# so the dots in names such as "time.now" cannot match other characters,
# and only the first matching line is used.
get_value ( ) {
	value=`awk -F= -v key="$1" '$1 == key { print substr($0, index($0, "=") + 1); exit }' "$state"`
	if test -z "$value"; then
		value="0"
	fi
}
123
# Download the statistics from the unbound server into the statefile.
#
# A lock file ($lock, holding the PID of its owner) serialises the links
# of this plugin that share one statefile, because fetching and writing
# the file is racy.  When the statefile's time.now entry is less than
# $lee seconds old the file is reused and nothing is fetched.
#
# New statistics are written to a temporary file next to the statefile
# and renamed over it only when unbound-control succeeds, so a failed or
# partial fetch never truncates the statefile.
#
# Globals read:  lock, state, conf, ctrl, lee
# Globals set:   pid, i, now, value, tmp
# Exits the script with status 1 (after printing an error) when the lock
# cannot be obtained or the statistics cannot be retrieved.
get_state ( ) {
	# see if the lock is stale (its owner is not running); if so, take it
	if test -f "$lock" ; then
		pid=`cat "$lock" 2>&1`
		kill -0 "$pid" >/dev/null 2>&1
		if test $? -ne 0 && test "$pid" != $$ ; then
			echo $$ >"$lock"
		fi
	fi

	# loop until the lock file exists and contains our own PID
	i=0
	while
		pid=`cat "$lock" 2>&1`
		test ! -f "$lock" || test "$pid" != $$
	do
		while test -f "$lock"; do
			# wait: spin for the first 1000 rounds, then sleep one
			# second per round, and give up after 1500 rounds
			i=`expr $i + 1`
			if test $i -gt 1000; then
				sleep 1
			fi
			if test $i -gt 1500; then
				echo "error locking $lock" "=" `cat "$lock"`
				rm -f "$lock"
				exit 1
			fi
		done
		# try to get it; carry on without a lock if it cannot be written
		if echo $$ >"$lock" ; then : ; else break; fi
	done

	# do not refetch if the file exists and is only $lee seconds old
	if test -f "$state"; then
		now=`date +%s`
		get_value "time.now"
		# drop the fractional part, expr only does integers
		value=`echo "$value" | sed -e 's/\..*$//'`
		if test "$now" -lt `expr "$value" + "$lee"`; then
			rm -f "$lock"
			return
		fi
	fi

	# fetch into a temporary file and move it into place on success.
	# $ctrl is left unquoted so that a configured command with arguments
	# keeps working.
	tmp="$state.tmp.$$"
	if $ctrl -c "$conf" stats > "$tmp" && mv -f "$tmp" "$state"; then
		rm -f "$lock"
	else
		rm -f "$tmp"
		echo "error retrieving data from unbound server"
		rm -f "$lock"
		exit 1
	fi
}
173
# Print the munin autoconf verdict: "yes" when the unbound configuration
# file ($conf) and the directory of the statefile ($state) both exist,
# otherwise "no (reason)".  Always returns 0: for autoconf the answer is
# carried by the printed text, not by the exit status.
do_autoconf ( ) {
	if test ! -f "$conf"; then
		echo no "($conf does not exist)"
		return 0
	fi
	statedir=`dirname "$state"`
	if test ! -d "$statedir"; then
		echo no "($statedir directory does not exist)"
		return 0
	fi
	echo yes
	return 0
}

if test "$1" = "autoconf" ; then
	do_autoconf
	exit 0
fi
186
# "suggest": list, one per line, the graph names this plugin can be
# linked as (the part that follows unbound_munin_ in the link name).
if test "$1" = "suggest" ; then
	for graph in hits queue memory by_type by_class by_opcode \
		by_rcode by_flags histogram; do
		echo "$graph"
	done
	exit 0
fi
199
# Determine which graph to produce from the name this script was invoked
# under: $id is whatever follows the last "unbound_munin_" in $0.
id=`echo $0 | sed -e 's/^.*unbound_munin_//'`
# nothing after the prefix: some default to keep people sane.
: "${id:=hits}"
206
# If statistic $1 is present in the statefile ($state), print the munin
# field configuration for it: label $2, minimum 0, type ABSOLUTE.
# Prints nothing when the key is absent or the statefile is unreadable.
#
# The key is compared as a literal string, so the dots in statistic
# names do not act as regex wildcards.  Sets the global $mn to the
# abbreviated field name.
exist_config ( ) {
	# $ABBREV is expanded unquoted on purpose: it holds several
	# separate "-e script" arguments for sed
	mn=`echo "$1" | sed $ABBREV | tr . _`
	if awk -F= -v key="$1" '$1 == key { found = 1; exit } END { exit !found }' "$state" >/dev/null 2>&1; then
		echo "$mn.label $2"
		echo "$mn.min 0"
		echo "$mn.type ABSOLUTE"
	fi
}
216
# Print the munin field configuration for statistic $1 (a name in
# unbound format): label $2, minimum 0 and type $3.  The field name is
# $1 shortened with $ABBREV and with dots turned into underscores; it is
# left in the global $mn.
p_config ( ) {
	mn=`echo $1 | sed $ABBREV | tr . _`
	echo "${mn}.label" "$2"
	echo "${mn}.min" 0
	echo "${mn}.type" $3
}
224
# Print the graph header shared by most graphs: title $1 and vertical
# label $2, base 1000, lower limit 0, no scaling, category DNS.
p_graph ( ) {
	echo "graph_title $1"
	echo "graph_args --base 1000 -l 0"
	echo "graph_vlabel $2"
	echo "graph_scale no"
	echo "graph_category DNS"
}

# Print the field configuration (through p_config, type ABSOLUTE) for
# every statefile entry whose key starts with $1.  $1 is a regular
# expression that is anchored at the start of the line; the label of each
# field is its key with that prefix removed.
p_config_group ( ) {
	for nm in `grep "^$1" "$state" | sed -e 's/=.*$//'`; do
		tp=`echo "$nm" | sed -e "s/^$1//"`
		p_config "$nm" "$tp" "ABSOLUTE"
	done
}

# Print the configuration of one histogram field: $1 field name,
# $2 label, $3 draw style, $4 colour.  Minimum 0, type ABSOLUTE.
p_hist ( ) {
	echo "$1.label $2"
	echo "$1.min 0"
	echo "$1.type ABSOLUTE"
	echo "$1.draw $3"
	echo "$1.colour $4"
}

# "config": describe the graph selected by $id to munin, then exit 0.
# The statefile is consulted to find out which threads, types, classes,
# opcodes and rcodes exist, so it is fetched first when it does not
# exist yet.  An $id that matches no graph prints nothing.
if test "$1" = "config" ; then
	if test ! -f "$state"; then
		get_state
	fi
	case "$id" in
	hits)
		p_graph "Unbound DNS traffic and cache hits" 'queries / ${graph_period}'
		for x in `grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
			sed -e 's/=.*//'`; do
			exist_config "$x" "queries handled by `basename $x .num.queries`"
		done
		p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
		p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
		p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
		p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
		p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
		p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
		p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
		p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
		# u_replies is the abbreviated field name of unwanted.replies
		echo "u_replies.warning $warn"
		echo "u_replies.critical $crit"
		echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
		;;
	queue)
		p_graph "Unbound requestlist size" "number of queries"
		p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
		p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
		p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
		p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
		echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
		;;
	memory)
		# own header: base 1024 and no graph_scale line
		echo "graph_title Unbound memory usage"
		echo "graph_args --base 1024 -l 0"
		echo "graph_vlabel memory used in bytes"
		echo "graph_category DNS"
		p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
		p_config "mem.cache.message" "Message cache memory" "GAUGE"
		p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
		p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
		p_config "msg.cache.count" "msg cache count" "GAUGE"
		p_config "rrset.cache.count" "rrset cache count" "GAUGE"
		p_config "infra.cache.count" "infra cache count" "GAUGE"
		p_config "key.cache.count" "key cache count" "GAUGE"
		echo "graph_info The memory used by unbound."
		;;
	by_type)
		p_graph "Unbound DNS queries by type" 'queries / ${graph_period}'
		p_config_group 'num\.query\.type\.'
		echo "graph_info queries by DNS RR type queried for"
		;;
	by_class)
		p_graph "Unbound DNS queries by class" 'queries / ${graph_period}'
		p_config_group 'num\.query\.class\.'
		echo "graph_info queries by DNS RR class queried for."
		;;
	by_opcode)
		p_graph "Unbound DNS queries by opcode" 'queries / ${graph_period}'
		p_config_group 'num\.query\.opcode\.'
		echo "graph_info queries by opcode in the query packet."
		;;
	by_rcode)
		p_graph "Unbound DNS answers by return code" 'answer packets / ${graph_period}'
		p_config_group 'num\.answer\.rcode\.'
		p_config "num.answer.secure" "answer secure" "ABSOLUTE"
		p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
		p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
		echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
		;;
	by_flags)
		p_graph "Unbound DNS incoming queries by flags" 'queries / ${graph_period}'
		p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
		p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
		p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
		p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
		p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
		p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
		p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
		p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
		p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
		p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
		echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
		;;
	histogram)
		p_graph "Unbound DNS histogram of reply time" 'queries / ${graph_period}'
		# cache hits form the base area, the time buckets stack on top
		p_hist hcache "cache hits" AREA 999999
		p_hist h64ms "0 msec - 66 msec" STACK 0000FF
		p_hist h128ms "66 msec - 131 msec" STACK 1F00DF
		p_hist h256ms "131 msec - 262 msec" STACK 3F00BF
		p_hist h512ms "262 msec - 524 msec" STACK 5F009F
		p_hist h1s "524 msec - 1 sec" STACK 7F007F
		p_hist h2s "1 sec - 2 sec" STACK 9F005F
		p_hist h4s "2 sec - 4 sec" STACK BF003F
		p_hist h8s "4 sec - 8 sec" STACK DF001F
		p_hist h16s "8 sec - ..." STACK FF0000
		echo "graph_info Histogram of the reply times for queries."
		;;
	esac

	exit 0
fi
414
# No special argument was given: report the statistics themselves,
# starting with a refresh of the statefile.
get_state

# Get the time elapsed.  get_value yields "0" for a key that is missing,
# so a zero here also covers a statefile without usable data.
get_value "time.elapsed"
case $value in
0|0.000000)
	echo "error: time elapsed 0 or could not retrieve data"
	exit 1
	;;
esac
elapsed=$value
425
# Print the munin "<field>.value <number>" line for statistic $1.  The
# number is looked up with get_value (left in the global $value) and the
# field name is $1 shortened with $ABBREV, dots turned into underscores
# (left in the global $mn).
print_value ( ) {
	get_value $1
	mn=`echo $1 | sed $ABBREV | tr . _`
	echo "${mn}.value" $value
}
432
# Print the munin "<field>.value <number>" line for statistic $1 when
# its statefile line ("key=value") is already at hand in $2, which saves
# a second lookup.  Leaves the number in the global $value and the
# abbreviated field name in the global $mn.
print_value_line ( ) {
	value=`echo $2 | sed -e 's/^.*=//'`
	mn=`echo $1 | sed $ABBREV | tr . _`
	echo "${mn}.value" $value
}
439
440
# Print the value line (through print_value_line) of every statefile
# entry whose key starts with $1.  $1 is a regular expression that is
# anchored at the start of the line.
print_value_group ( ) {
	for x in `grep "^$1" "$state"`; do
		nm=`echo "$x" | sed -e 's/=.*$//'`
		print_value_line "$nm" "$x"
	done
}

# Add up the values of the statistics named by the arguments and leave
# the total in the global $r.  Missing statistics count as 0, which is
# what get_value reports for them.
sum_values ( ) {
	r=0
	for x in "$@"; do
		get_value "$x"
		r=`expr $r + $value`
	done
}

# Print the values for the graph selected by $id.  An $id that matches
# no graph prints nothing.
case "$id" in
hits)
	# per-thread counters found in the statefile, then the fixed list;
	# statistics that are absent from the statefile are skipped
	for x in `grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
		sed -e 's/=.*//'` total.num.queries \
		total.num.cachehits total.num.prefetch num.query.tcp \
		num.query.tcpout num.query.ipv6 unwanted.queries \
		unwanted.replies; do
		if grep "^$x=" "$state" >/dev/null 2>&1; then
			print_value "$x"
		fi
	done
	;;
queue)
	for x in total.requestlist.avg total.requestlist.max \
		total.requestlist.overwritten total.requestlist.exceeded; do
		print_value "$x"
	done
	;;
memory)
	for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
		mem.mod.validator msg.cache.count rrset.cache.count \
		infra.cache.count key.cache.count; do
		print_value "$x"
	done
	;;
by_type)
	print_value_group 'num\.query\.type\.'
	;;
by_class)
	print_value_group 'num\.query\.class\.'
	;;
by_opcode)
	print_value_group 'num\.query\.opcode\.'
	;;
by_rcode)
	print_value_group 'num\.answer\.rcode\.'
	print_value "num.answer.secure"
	print_value "num.answer.bogus"
	print_value "num.rrset.bogus"
	;;
by_flags)
	for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC \
		num.query.flags.RD num.query.flags.RA num.query.flags.Z \
		num.query.flags.AD num.query.flags.CD \
		num.query.edns.present num.query.edns.DO; do
		print_value "$x"
	done
	;;
histogram)
	get_value total.num.cachehits
	echo hcache.value $value
	# all buckets below 0.065536 sec are reported as one field
	sum_values histogram.000000.000000.to.000000.000001 \
		histogram.000000.000001.to.000000.000002 \
		histogram.000000.000002.to.000000.000004 \
		histogram.000000.000004.to.000000.000008 \
		histogram.000000.000008.to.000000.000016 \
		histogram.000000.000016.to.000000.000032 \
		histogram.000000.000032.to.000000.000064 \
		histogram.000000.000064.to.000000.000128 \
		histogram.000000.000128.to.000000.000256 \
		histogram.000000.000256.to.000000.000512 \
		histogram.000000.000512.to.000000.001024 \
		histogram.000000.001024.to.000000.002048 \
		histogram.000000.002048.to.000000.004096 \
		histogram.000000.004096.to.000000.008192 \
		histogram.000000.008192.to.000000.016384 \
		histogram.000000.016384.to.000000.032768 \
		histogram.000000.032768.to.000000.065536
	echo h64ms.value $r
	get_value histogram.000000.065536.to.000000.131072
	echo h128ms.value $value
	get_value histogram.000000.131072.to.000000.262144
	echo h256ms.value $value
	get_value histogram.000000.262144.to.000000.524288
	echo h512ms.value $value
	get_value histogram.000000.524288.to.000001.000000
	echo h1s.value $value
	get_value histogram.000001.000000.to.000002.000000
	echo h2s.value $value
	get_value histogram.000002.000000.to.000004.000000
	echo h4s.value $value
	get_value histogram.000004.000000.to.000008.000000
	echo h8s.value $value
	# all buckets from 8 sec upwards are reported as one field
	sum_values histogram.000008.000000.to.000016.000000 \
		histogram.000016.000000.to.000032.000000 \
		histogram.000032.000000.to.000064.000000 \
		histogram.000064.000000.to.000128.000000 \
		histogram.000128.000000.to.000256.000000 \
		histogram.000256.000000.to.000512.000000 \
		histogram.000512.000000.to.001024.000000 \
		histogram.001024.000000.to.002048.000000 \
		histogram.002048.000000.to.004096.000000 \
		histogram.004096.000000.to.008192.000000 \
		histogram.008192.000000.to.016384.000000 \
		histogram.016384.000000.to.032768.000000 \
		histogram.032768.000000.to.065536.000000 \
		histogram.065536.000000.to.131072.000000 \
		histogram.131072.000000.to.262144.000000 \
		histogram.262144.000000.to.524288.000000
	echo h16s.value $r
	;;
esac
560