unbound_munin_ revision 361435
1#!/bin/sh
2#
3# plugin for munin to monitor usage of unbound servers.
4# To install copy this to /usr/local/share/munin/plugins/unbound_munin_
5# and use munin-node-configure (--suggest, --shell).
6#
7# (C) 2008 W.C.A. Wijngaards.  BSD Licensed.
8#
# To configure unbound for this plugin, enable statistics and unbound-control
# in unbound.conf:
10#	server:		extended-statistics: yes
11#			statistics-cumulative: no
12#			statistics-interval: 0
13#	remote-control:	control-enable: yes
14# Run the command unbound-control-setup to generate the key files.
15#
16# Environment variables for this script
17#	statefile	- where to put temporary statefile.
18#	unbound_conf	- where the unbound.conf file is located.
19#	unbound_control	- where to find unbound-control executable.
20#	spoof_warn	- what level to warn about spoofing
21#	spoof_crit	- what level to crit about spoofing
22#
23# You can set them in your munin/plugin-conf.d/plugins.conf file
24# with:
25# [unbound*]
26# user root
27# env.statefile /usr/local/var/munin/plugin-state/unbound-state
28# env.unbound_conf /usr/local/etc/unbound/unbound.conf
29# env.unbound_control /usr/local/sbin/unbound-control
30# env.spoof_warn 1000
31# env.spoof_crit 100000
32#
33# This plugin can create different graphs depending on what name
34# you link it as (with ln -s) into the plugins directory
35# You can link it multiple times.
36# If you are only a casual user, the _hits and _by_type are most interesting,
37# possibly followed by _by_rcode.
38#
39#	unbound_munin_hits	- base volume, cache hits, unwanted traffic
40#	unbound_munin_queue	- to monitor the internal requestlist
41#	unbound_munin_memory	- memory usage
42#	unbound_munin_by_type	- incoming queries by type
43#	unbound_munin_by_class	- incoming queries by class
44#	unbound_munin_by_opcode	- incoming queries by opcode
45#	unbound_munin_by_rcode	- answers by rcode, validation status
46#	unbound_munin_by_flags	- incoming queries by flags
47#	unbound_munin_histogram	- histogram of query resolving times
48#
49# Magic markers - optional - used by installation scripts and
50# munin-config:  (originally contrib family but munin-node-configure ignores it)
51#
52#%# family=auto
53#%# capabilities=autoconf suggest
54
55# POD documentation
56: <<=cut
57=head1 NAME
58
59unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.
60
61=head1 APPLICABLE SYSTEMS
62
63System with unbound daemon.
64
65=head1 CONFIGURATION
66
67  [unbound*]
68  user root
69  env.statefile /usr/local/var/munin/plugin-state/unbound-state
70  env.unbound_conf /usr/local/etc/unbound/unbound.conf
71  env.unbound_control /usr/local/sbin/unbound-control
72  env.spoof_warn 1000
73  env.spoof_crit 100000
74
75Use the .env settings to override the defaults.
76
77=head1 USAGE
78
Can be used to present different graphs. Link the plugin (with ln -s) under
one of the following names in the plugins directory to enable that graph:

  unbound_munin_hits       - base volume, cache hits, unwanted traffic
  unbound_munin_queue      - to monitor the internal requestlist
  unbound_munin_memory     - memory usage
  unbound_munin_by_type    - incoming queries by type
  unbound_munin_by_class   - incoming queries by class
  unbound_munin_by_opcode  - incoming queries by opcode
  unbound_munin_by_rcode   - answers by rcode, validation status
  unbound_munin_by_flags   - incoming queries by flags
  unbound_munin_histogram  - histogram of query resolving times
90
91=head1 AUTHOR
92
93Copyright 2008 W.C.A. Wijngaards
94
95=head1 LICENSE
96
97BSD
98
99=cut
100
101state=${statefile:-/usr/local/var/munin/plugin-state/unbound-state}
102conf=${unbound_conf:-/usr/local/etc/unbound/unbound.conf}
103ctrl=${unbound_control:-/usr/local/sbin/unbound-control}
104warn=${spoof_warn:-1000}
105crit=${spoof_crit:-100000}
106lock=$state.lock
107
108# number of seconds between polling attempts.
109# makes the statefile hang around for at least this many seconds,
110# so that multiple links of this script can share the results.
111lee=55
112
113# to keep things within 19 characters
114ABBREV="-e s/total/t/ -e s/thread/t/ -e s/num/n/ -e s/query/q/ -e s/answer/a/ -e s/unwanted/u/ -e s/requestlist/ql/ -e s/type/t/ -e s/class/c/ -e s/opcode/o/ -e s/rcode/r/ -e s/edns/e/ -e s/mem/m/ -e s/cache/c/ -e s/mod/m/"
115
116# get value from $1 into return variable $value
117get_value ( ) {
118	value="`grep '^'$1'=' $state | sed -e 's/^.*=//'`"
119	if test "$value"x = ""x; then
120		value="0"
121	fi
122}
123
124# download the state from the unbound server.
125get_state ( ) {
126	# obtain lock for fetching the state
127	# because there is a race condition in fetching and writing to file
128
129	# see if the lock is stale, if so, take it 
130	if test -f $lock ; then
131		pid="`cat $lock 2>&1`"
132		kill -0 "$pid" >/dev/null 2>&1
133		if test $? -ne 0 -a "$pid" != $$ ; then
134			echo $$ >$lock
135		fi
136	fi
137
138	i=0
139	while test ! -f $lock || test "`cat $lock 2>&1`" != $$; do
140		while test -f $lock; do
141			# wait
142			i=`expr $i + 1`
143			if test $i -gt 1000; then
144				sleep 1;
145			fi
146			if test $i -gt 1500; then
147				echo "error locking $lock" "=" `cat $lock`
148				rm -f $lock
149				exit 1
150			fi
151		done
152		# try to get it
153		if echo $$ >$lock ; then : ; else break; fi
154	done
155	# do not refetch if the file exists and only LEE seconds old
156	if test -f $state; then
157		now=`date +%s`
158		get_value "time.now"
159		value="`echo $value | sed -e 's/\..*$//'`"
160		if test $now -lt `expr $value + $lee`; then
161			rm -f $lock
162			return
163		fi
164	fi
165	$ctrl -c $conf stats > $state
166	if test $? -ne 0; then
167		echo "error retrieving data from unbound server"
168		rm -f $lock
169		exit 1
170	fi
171	rm -f $lock
172}
173
174if test "$1" = "autoconf" ; then
175	if test ! -f $conf; then
176		echo no "($conf does not exist)"
177		exit 1
178	fi
179	if test ! -d `dirname $state`; then
180		echo no "(`dirname $state` directory does not exist)"
181		exit 1
182	fi
183	echo yes
184	exit 0
185fi
186
187if test "$1" = "suggest" ; then
188	echo "hits"
189	echo "queue"
190	echo "memory"
191	echo "by_type"
192	echo "by_class"
193	echo "by_opcode"
194	echo "by_rcode"
195	echo "by_flags"
196	echo "histogram"
197	exit 0
198fi
199
200# determine my type, by name
201id=`echo $0 | sed -e 's/^.*unbound_munin_//'`
202if test "$id"x = ""x; then
203	# some default to keep people sane.
204	id="hits"
205fi
206
207# if $1 exists in statefile, config is echoed with label $2
208exist_config ( ) {
209	mn=`echo $1 | sed $ABBREV | tr . _`
210	if grep '^'$1'=' $state >/dev/null 2>&1; then
211		echo "$mn.label $2"
212		echo "$mn.min 0"
213		echo "$mn.type ABSOLUTE"
214	fi
215}
216
217# print label and min 0 for a name $1 in unbound format
218p_config ( ) {
219	mn=`echo $1 | sed $ABBREV | tr . _`
220	echo $mn.label "$2"
221	echo $mn.min 0
222	echo $mn.type $3
223}
224
225if test "$1" = "config" ; then
226	if test ! -f $state; then
227		get_state
228	fi
229	case $id in
230	hits)
231		echo "graph_title Unbound DNS traffic and cache hits"
232		echo "graph_args --base 1000 -l 0"
233		echo "graph_vlabel queries / \${graph_period}"
234		echo "graph_scale no"
235		echo "graph_category DNS"
236		for x in `grep "^thread[0-9][0-9]*\.num\.queries=" $state |
237			sed -e 's/=.*//'`; do
238			exist_config $x "queries handled by `basename $x .num.queries`"
239		done
240		p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
241		p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
242		p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
243		p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
244		p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
245		p_config "num.query.tls" "TLS queries" "ABSOLUTE"
246		p_config "num.query.tls.resume" "TLS resumes" "ABSOLUTE"
247		p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
248		p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
249		p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
250		echo "u_replies.warning $warn"
251		echo "u_replies.critical $crit"
252		echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
253		;;
254	queue)
255		echo "graph_title Unbound requestlist size"
256		echo "graph_args --base 1000 -l 0"
257		echo "graph_vlabel number of queries"
258		echo "graph_scale no"
259		echo "graph_category DNS"
260		p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
261		p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
262		p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
263		p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
264		echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
265		;;
266	memory)
267		echo "graph_title Unbound memory usage"
268		echo "graph_args --base 1024 -l 0"
269		echo "graph_vlabel memory used in bytes"
270		echo "graph_category DNS"
271		p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
272		p_config "mem.cache.message" "Message cache memory" "GAUGE"
273		p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
274		p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
275		p_config "msg.cache.count" "msg cache count" "GAUGE"
276		p_config "rrset.cache.count" "rrset cache count" "GAUGE"
277		p_config "infra.cache.count" "infra cache count" "GAUGE"
278		p_config "key.cache.count" "key cache count" "GAUGE"
279		echo "graph_info The memory used by unbound."
280		;;
281	by_type)
282		echo "graph_title Unbound DNS queries by type"
283		echo "graph_args --base 1000 -l 0"
284		echo "graph_vlabel queries / \${graph_period}"
285		echo "graph_scale no"
286		echo "graph_category DNS"
287		for x in `grep "^num.query.type" $state`; do
288			nm=`echo $x | sed -e 's/=.*$//'`
289			tp=`echo $nm | sed -e s/num.query.type.//`
290			p_config "$nm" "$tp" "ABSOLUTE"
291		done
292		echo "graph_info queries by DNS RR type queried for"
293		;;
294	by_class)
295		echo "graph_title Unbound DNS queries by class"
296		echo "graph_args --base 1000 -l 0"
297		echo "graph_vlabel queries / \${graph_period}"
298		echo "graph_scale no"
299		echo "graph_category DNS"
300		for x in `grep "^num.query.class" $state`; do
301			nm=`echo $x | sed -e 's/=.*$//'`
302			tp=`echo $nm | sed -e s/num.query.class.//`
303			p_config "$nm" "$tp" "ABSOLUTE"
304		done
305		echo "graph_info queries by DNS RR class queried for."
306		;;
307	by_opcode)
308		echo "graph_title Unbound DNS queries by opcode"
309		echo "graph_args --base 1000 -l 0"
310		echo "graph_vlabel queries / \${graph_period}"
311		echo "graph_scale no"
312		echo "graph_category DNS"
313		for x in `grep "^num.query.opcode" $state`; do
314			nm=`echo $x | sed -e 's/=.*$//'`
315			tp=`echo $nm | sed -e s/num.query.opcode.//`
316			p_config "$nm" "$tp" "ABSOLUTE"
317		done
318		echo "graph_info queries by opcode in the query packet."
319		;;
320	by_rcode)
321		echo "graph_title Unbound DNS answers by return code"
322		echo "graph_args --base 1000 -l 0"
323		echo "graph_vlabel answer packets / \${graph_period}"
324		echo "graph_scale no"
325		echo "graph_category DNS"
326		for x in `grep "^num.answer.rcode" $state`; do
327			nm=`echo $x | sed -e 's/=.*$//'`
328			tp=`echo $nm | sed -e s/num.answer.rcode.//`
329			p_config "$nm" "$tp" "ABSOLUTE"
330		done
331		p_config "num.answer.secure" "answer secure" "ABSOLUTE"
332		p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
333		p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
334		echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
335		;;
336	by_flags)
337		echo "graph_title Unbound DNS incoming queries by flags"
338		echo "graph_args --base 1000 -l 0"
339		echo "graph_vlabel queries / \${graph_period}"
340		echo "graph_scale no"
341		echo "graph_category DNS"
342		p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
343		p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
344		p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
345		p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
346		p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
347		p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
348		p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
349		p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
350		p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
351		p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
352		echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
353		;;
354	histogram)
355		echo "graph_title Unbound DNS histogram of reply time"
356		echo "graph_args --base 1000 -l 0"
357		echo "graph_vlabel queries / \${graph_period}"
358		echo "graph_scale no"
359		echo "graph_category DNS"
360		echo hcache.label "cache hits"
361		echo hcache.min 0
362		echo hcache.type ABSOLUTE
363		echo hcache.draw AREA
364		echo hcache.colour 999999
365		echo h64ms.label "0 msec - 66 msec"
366		echo h64ms.min 0
367		echo h64ms.type ABSOLUTE
368		echo h64ms.draw STACK
369		echo h64ms.colour 0000FF
370		echo h128ms.label "66 msec - 131 msec"
371		echo h128ms.min 0
372		echo h128ms.type ABSOLUTE
373		echo h128ms.colour 1F00DF
374		echo h128ms.draw STACK
375		echo h256ms.label "131 msec - 262 msec"
376		echo h256ms.min 0
377		echo h256ms.type ABSOLUTE
378		echo h256ms.draw STACK
379		echo h256ms.colour 3F00BF
380		echo h512ms.label "262 msec - 524 msec"
381		echo h512ms.min 0
382		echo h512ms.type ABSOLUTE
383		echo h512ms.draw STACK
384		echo h512ms.colour 5F009F
385		echo h1s.label "524 msec - 1 sec"
386		echo h1s.min 0
387		echo h1s.type ABSOLUTE
388		echo h1s.draw STACK
389		echo h1s.colour 7F007F
390		echo h2s.label "1 sec - 2 sec"
391		echo h2s.min 0
392		echo h2s.type ABSOLUTE
393		echo h2s.draw STACK
394		echo h2s.colour 9F005F
395		echo h4s.label "2 sec - 4 sec"
396		echo h4s.min 0
397		echo h4s.type ABSOLUTE
398		echo h4s.draw STACK
399		echo h4s.colour BF003F
400		echo h8s.label "4 sec - 8 sec"
401		echo h8s.min 0
402		echo h8s.type ABSOLUTE
403		echo h8s.draw STACK
404		echo h8s.colour DF001F
405		echo h16s.label "8 sec - ..."
406		echo h16s.min 0
407		echo h16s.type ABSOLUTE
408		echo h16s.draw STACK
409		echo h16s.colour FF0000
410		echo "graph_info Histogram of the reply times for queries."
411		;;
412	esac
413
414	exit 0
415fi
416
417# do the stats itself
418get_state
419
420# get the time elapsed
421get_value "time.elapsed"
422if test $value = 0 || test $value = "0.000000"; then
423	echo "error: time elapsed 0 or could not retrieve data"
424	exit 1
425fi
426elapsed="$value"
427
428# print value for $1
429print_value ( ) {
430	mn=`echo $1 | sed $ABBREV | tr . _`
431	get_value $1
432	echo "$mn.value" $value
433}
434
435# print value if line already found in $2
436print_value_line ( ) {
437	mn=`echo $1 | sed $ABBREV | tr . _`
438	value="`echo $2 | sed -e 's/^.*=//'`"
439	echo "$mn.value" $value
440}
441
442
443case $id in
444hits)
445	for x in `grep "^thread[0-9][0-9]*\.num\.queries=" $state |
446		sed -e 's/=.*//'` total.num.queries \
447		total.num.cachehits total.num.prefetch num.query.tcp \
448		num.query.tcpout num.query.tls num.query.tls.resume \
449		num.query.ipv6 unwanted.queries \
450		unwanted.replies; do
451		if grep "^"$x"=" $state >/dev/null 2>&1; then
452			print_value $x
453		fi
454	done
455	;;
456queue)
457	for x in total.requestlist.avg total.requestlist.max \
458		total.requestlist.overwritten total.requestlist.exceeded; do
459		print_value $x
460	done
461	;;
462memory)
463	for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
464		mem.mod.validator msg.cache.count rrset.cache.count \
465		infra.cache.count key.cache.count; do
466		print_value $x
467	done
468	;;
469by_type)
470	for x in `grep "^num.query.type" $state`; do
471		nm=`echo $x | sed -e 's/=.*$//'`
472		print_value_line $nm $x
473	done
474	;;
475by_class)
476	for x in `grep "^num.query.class" $state`; do
477		nm=`echo $x | sed -e 's/=.*$//'`
478		print_value_line $nm $x
479	done
480	;;
481by_opcode)
482	for x in `grep "^num.query.opcode" $state`; do
483		nm=`echo $x | sed -e 's/=.*$//'`
484		print_value_line $nm $x
485	done
486	;;
487by_rcode)
488	for x in `grep "^num.answer.rcode" $state`; do
489		nm=`echo $x | sed -e 's/=.*$//'`
490		print_value_line $nm $x
491	done
492	print_value "num.answer.secure"
493	print_value "num.answer.bogus"
494	print_value "num.rrset.bogus"
495	;;
496by_flags)
497	for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC num.query.flags.RD num.query.flags.RA num.query.flags.Z num.query.flags.AD num.query.flags.CD num.query.edns.present num.query.edns.DO; do
498		print_value $x
499	done
500	;;
501histogram)
502	get_value total.num.cachehits
503	echo hcache.value $value
504	r=0
505	for x in histogram.000000.000000.to.000000.000001 \
506		histogram.000000.000001.to.000000.000002 \
507		histogram.000000.000002.to.000000.000004 \
508		histogram.000000.000004.to.000000.000008 \
509		histogram.000000.000008.to.000000.000016 \
510		histogram.000000.000016.to.000000.000032 \
511		histogram.000000.000032.to.000000.000064 \
512		histogram.000000.000064.to.000000.000128 \
513		histogram.000000.000128.to.000000.000256 \
514		histogram.000000.000256.to.000000.000512 \
515		histogram.000000.000512.to.000000.001024 \
516		histogram.000000.001024.to.000000.002048 \
517		histogram.000000.002048.to.000000.004096 \
518		histogram.000000.004096.to.000000.008192 \
519		histogram.000000.008192.to.000000.016384 \
520		histogram.000000.016384.to.000000.032768 \
521		histogram.000000.032768.to.000000.065536; do
522		get_value $x
523		r=`expr $r + $value`
524	done
525	echo h64ms.value $r
526	get_value histogram.000000.065536.to.000000.131072
527	echo h128ms.value $value
528	get_value histogram.000000.131072.to.000000.262144
529	echo h256ms.value $value
530	get_value histogram.000000.262144.to.000000.524288
531	echo h512ms.value $value
532	get_value histogram.000000.524288.to.000001.000000
533	echo h1s.value $value
534	get_value histogram.000001.000000.to.000002.000000
535	echo h2s.value $value
536	get_value histogram.000002.000000.to.000004.000000
537	echo h4s.value $value
538	get_value histogram.000004.000000.to.000008.000000
539	echo h8s.value $value
540	r=0
541	for x in histogram.000008.000000.to.000016.000000 \
542		histogram.000016.000000.to.000032.000000 \
543		histogram.000032.000000.to.000064.000000 \
544		histogram.000064.000000.to.000128.000000 \
545		histogram.000128.000000.to.000256.000000 \
546		histogram.000256.000000.to.000512.000000 \
547		histogram.000512.000000.to.001024.000000 \
548		histogram.001024.000000.to.002048.000000 \
549		histogram.002048.000000.to.004096.000000 \
550		histogram.004096.000000.to.008192.000000 \
551		histogram.008192.000000.to.016384.000000 \
552		histogram.016384.000000.to.032768.000000 \
553		histogram.032768.000000.to.065536.000000 \
554		histogram.065536.000000.to.131072.000000 \
555		histogram.131072.000000.to.262144.000000 \
556		histogram.262144.000000.to.524288.000000; do
557		get_value $x
558		r=`expr $r + $value`
559	done
560	echo h16s.value $r
561	;;
562esac
563