1#!/bin/sh
2#
3# plugin for munin to monitor usage of unbound servers.
4# To install copy this to /usr/local/share/munin/plugins/unbound_munin_
5# and use munin-node-configure (--suggest, --shell).
6#
7# (C) 2008 W.C.A. Wijngaards.  BSD Licensed.
8#
# To install, enable statistics and unbound-control in unbound.conf:
10#	server:		extended-statistics: yes
11#			statistics-cumulative: no
12#			statistics-interval: 0
13#	remote-control:	control-enable: yes
14# Run the command unbound-control-setup to generate the key files.
15#
16# Environment variables for this script
17#	unbound_conf	- where the unbound.conf file is located.
18#	unbound_control	- where to find unbound-control executable.
19#	spoof_warn	- what level to warn about spoofing
20#	spoof_crit	- what level to crit about spoofing
21#
22# You can set them in your munin/plugin-conf.d/plugins.conf file
23# with:
24# [unbound*]
25# user root
26# env.unbound_conf /usr/local/etc/unbound/unbound.conf
27# env.unbound_control /usr/local/sbin/unbound-control
28# env.spoof_warn 1000
29# env.spoof_crit 100000
30#
31# This plugin can create different graphs depending on what name
32# you link it as (with ln -s) into the plugins directory
33# You can link it multiple times.
34# If you are only a casual user, the _hits and _by_type are most interesting,
35# possibly followed by _by_rcode.
36#
37#	unbound_munin_hits	- base volume, cache hits, unwanted traffic
38#	unbound_munin_queue	- to monitor the internal requestlist
39#	unbound_munin_memory	- memory usage
40#	unbound_munin_by_type	- incoming queries by type
41#	unbound_munin_by_class	- incoming queries by class
42#	unbound_munin_by_opcode	- incoming queries by opcode
43#	unbound_munin_by_rcode	- answers by rcode, validation status
44#	unbound_munin_by_flags	- incoming queries by flags
45#	unbound_munin_histogram	- histogram of query resolving times
46#
47# Magic markers - optional - used by installation scripts and
48# munin-config:  (originally contrib family but munin-node-configure ignores it)
49#
50#%# family=auto
51#%# capabilities=autoconf suggest
52
53# POD documentation
54: <<=cut
55=head1 NAME
56
57unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.
58
59=head1 APPLICABLE SYSTEMS
60
61System with unbound daemon.
62
63=head1 CONFIGURATION
64
65  [unbound*]
66  user root
67  env.unbound_conf /usr/local/etc/unbound/unbound.conf
68  env.unbound_control /usr/local/sbin/unbound-control
69  env.spoof_warn 1000
70  env.spoof_crit 100000
71
72Use the .env settings to override the defaults.
73
74=head1 USAGE
75
Can be used to present different graphs. Use ln -s for that name in
the plugins directory to enable the graph.

  unbound_munin_hits       - base volume, cache hits, unwanted traffic
  unbound_munin_queue      - to monitor the internal requestlist
  unbound_munin_memory     - memory usage
  unbound_munin_by_type    - incoming queries by type
  unbound_munin_by_class   - incoming queries by class
  unbound_munin_by_opcode  - incoming queries by opcode
  unbound_munin_by_rcode   - answers by rcode, validation status
  unbound_munin_by_flags   - incoming queries by flags
  unbound_munin_histogram  - histogram of query resolving times
87
88=head1 AUTHOR
89
90Copyright 2008 W.C.A. Wijngaards
91
92=head1 LICENSE
93
94BSD
95
96=cut
97
# Files kept in munin's plugin state directory: the cached statistics, the
# lock taken while they are refreshed, and the list of counters seen so far.
state="${MUNIN_PLUGSTATE}/unbound.state"
lock="${state}.lock"
seentags="${MUNIN_PLUGSTATE}/unbound-seentags.state"

# Settings read from the environment, each with a built-in default.
conf="${unbound_conf:-/usr/local/etc/unbound/unbound.conf}"
ctrl="${unbound_control:-/usr/local/sbin/unbound-control}"
warn="${spoof_warn:-1000}"
crit="${spoof_crit:-100000}"

# Minimum age in seconds before the state file is fetched again.
# Keeping it around this long lets the multiple links of this script
# share one set of results.
lee=55

# sed options that shorten counter names, to keep things within 19 characters
ABBREV="-e s/total/t/ -e s/thread/t/ -e s/num/n/ -e s/query/q/ -e s/answer/a/ -e s/unwanted/u/ -e s/requestlist/ql/ -e s/type/t/ -e s/class/c/ -e s/opcode/o/ -e s/rcode/r/ -e s/edns/e/ -e s/mem/m/ -e s/cache/c/ -e s/mod/m/"
113
# Look up the statistic named $1 in the state file and leave its value in
# the global $value; $value is "0" when the name is absent or has no value.
# The name is compared as a literal string with the text before the first
# '=', so the dots in counter names are not regex wildcards, and only the
# first matching line is used.
get_value ( ) {
	# The "" concatenations force a string (never numeric) comparison.
	value=$(awk -F= -v key="$1" \
		'($1 "") == (key "") { print substr($0, length(key) + 2); exit }' \
		"$state")
	if test "$value"x = ""x; then
		value="0"
	fi
}
121
# Merge the counter names currently in the state file into the seentags
# file, so that a query type/class/opcode/rcode that was seen once keeps
# being listed. This is run while holding the lock, after the state file
# is updated.
# A few common tags are always included. The new list is built in a
# temporary file and renamed into place, so a reader never sees a
# truncated or half-written list.
update_seentags() {
    seen_tmp="${seentags}.tmp.$$"
    # A missing seentags file simply contributes nothing.
    seen_old="$(cat "${seentags}" 2> /dev/null)"
    if {
        printf '%s\n' "${seen_old}" \
            num.query.type.A \
            num.query.class.IN \
            num.query.opcode.QUERY \
            num.answer.rcode.NOERROR
        grep '^num' "${state}" | sed -e 's/=.*//'
    } | sed -e '/^$/d' | sort -u > "${seen_tmp}"; then
        mv -f "${seen_tmp}" "${seentags}"
    else
        rm -f "${seen_tmp}"
        return 1
    fi
}
133
# Download the statistics from the unbound server into $state.
# Takes $lock first, returns without fetching when the existing state file
# is less than $lee seconds old, and otherwise runs "$ctrl -c $conf stats"
# followed by update_seentags. The lock is removed on every exit path.
# On a locking or fetch error a message is printed and the script exits 1.
get_state ( ) {
	# obtain lock for fetching the state
	# because there is a race condition in fetching and writing to file

	# see if the lock is stale (its owner is gone), if so, take it
	if test -f "$lock" ; then
		pid="$(cat "$lock" 2>&1)"
		kill -0 "$pid" >/dev/null 2>&1
		if test $? -ne 0 && test "$pid" != "$$" ; then
			echo $$ >"$lock"
		fi
	fi

	i=0
	while test ! -f "$lock" || test "$(cat "$lock" 2>&1)" != "$$"; do
		while test -f "$lock"; do
			# wait: spin for the first 1000 polls, then poll once per
			# second, and give up after 1500 polls in total
			i=$((i + 1))
			if test "$i" -gt 1000; then
				sleep 1
			fi
			if test "$i" -gt 1500; then
				echo "error locking $lock" "=" $(cat "$lock")
				rm -f "$lock"
				exit 1
			fi
		done
		# try to get it; carry on without the lock if it cannot be written
		if echo $$ >"$lock" ; then : ; else break; fi
	done
	# do not refetch if the file exists and is less than $lee seconds old
	if test -f "$state"; then
		now=$(date +%s)
		get_value "time.now"
		# whole seconds only
		value=${value%%.*}
		if test "$now" -lt "$(expr "$value" + "$lee")"; then
			rm -f "$lock"
			return
		fi
	fi
	# Fetch into a temporary file and rename it into place only on
	# success, so a failed or interrupted fetch never leaves a truncated
	# state file behind for the other links of this plugin to read.
	# $ctrl is left unquoted on purpose: it is split into words, so a
	# value that carries extra arguments keeps working.
	newstate="$state.new.$$"
	$ctrl -c "$conf" stats > "$newstate"
	if test $? -ne 0; then
		echo "error retrieving data from unbound server"
		rm -f "$newstate" "$lock"
		exit 1
	fi
	if mv -f "$newstate" "$state"; then : ; else
		echo "error writing $state"
		rm -f "$newstate" "$lock"
		exit 1
	fi
	update_seentags
	rm -f "$lock"
}
184
# "autoconf": report whether this plugin can be used on this host.
# Prints "yes", or "no (reason)" when the unbound configuration file or the
# directory that holds the state file is missing; always exits 0.
if test "$1" = "autoconf" ; then
	statedir=$(dirname "$state")
	if test ! -f "$conf"; then
		echo no "($conf does not exist)"
	elif test ! -d "$statedir"; then
		echo no "($statedir directory does not exist)"
	else
		echo yes
	fi
	exit 0
fi
197
# "suggest": list the graph names this plugin can be linked as, one per line.
case "$1" in
suggest)
	for graph in hits queue memory by_type by_class by_opcode by_rcode \
		by_flags histogram; do
		echo "$graph"
	done
	exit 0
	;;
esac
210
# determine my type, by name: $id is whatever follows the last
# "unbound_munin_" in the name this script was invoked as.
case "$0" in
*unbound_munin_*)
	id=${0##*unbound_munin_}
	;;
*)
	id=""
	;;
esac
if test -z "$id"; then
	# some default to keep people sane; also used when the script is run
	# under a name that does not contain "unbound_munin_" at all.
	id="hits"
fi
217
# If the statistic named $1 exists in the state file, print its munin
# config entry (label $2, min 0, type ABSOLUTE); otherwise print nothing.
# The name is matched as a literal string against the text before the
# first '=' on each line.
exist_config ( ) {
	# $ABBREV is deliberately unquoted: it has to split into sed options.
	mn=$(echo "$1" | sed $ABBREV | tr . _)
	# The "" concatenations force a string comparison; awk's exit status
	# tells whether the name was found.
	if awk -F= -v key="$1" \
		'($1 "") == (key "") { found = 1; exit } END { exit !found }' \
		"$state" 2>/dev/null; then
		echo "$mn.label $2"
		echo "$mn.min 0"
		echo "$mn.type ABSOLUTE"
	fi
}
227
# Print the munin config entry for the unbound statistic named $1:
# label $2, min 0 and type $3, under the abbreviated field name.
p_config ( ) {
	mn=$(echo $1 | sed $ABBREV | tr . _)
	printf '%s.label %s\n%s.min 0\n%s.type %s\n' "$mn" "$2" "$mn" "$mn" "$3"
}
235
# Print a config entry (type ABSOLUTE) for every counter listed in the
# seentags file whose name starts with $1; the part of the name after
# "$1." is used as the label.
p_seen_config ( ) {
	grep "^$1" "$seentags" | while IFS= read -r nm; do
		tp=${nm#"$1".}
		p_config "$nm" "$tp" "ABSOLUTE"
	done
}

# Print the config entry for one histogram field:
# $1 field name, $2 label, $3 draw style, $4 colour.
p_hist_config ( ) {
	echo "$1.label $2"
	echo "$1.min 0"
	echo "$1.type ABSOLUTE"
	echo "$1.draw $3"
	echo "$1.colour $4"
}

# "config": describe the graph selected by $id to munin, then exit 0.
# The state file is fetched first only when it does not exist yet.
if test "$1" = "config" ; then
	if test ! -f "$state"; then
		get_state
	fi
	case "$id" in
	hits)
		echo "graph_title Unbound DNS traffic and cache hits"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel queries / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		# one line per worker thread present in the state file
		for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
			sed -e 's/=.*//'); do
			exist_config "$x" "queries handled by ${x%.num.queries}"
		done
		p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
		p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
		p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
		p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
		p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
		p_config "num.query.udpout" "UDP out queries" "ABSOLUTE"
		p_config "num.query.tls" "TLS queries" "ABSOLUTE"
		p_config "num.query.tls.resume" "TLS resumes" "ABSOLUTE"
		p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
		p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
		p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
		# u_replies is the abbreviated field name of unwanted.replies
		echo "u_replies.warning $warn"
		echo "u_replies.critical $crit"
		echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
		;;
	queue)
		echo "graph_title Unbound requestlist size"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel number of queries"
		echo "graph_scale no"
		echo "graph_category dns"
		p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
		p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
		p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
		p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
		echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
		;;
	memory)
		echo "graph_title Unbound memory usage"
		echo "graph_args --base 1024 -l 0"
		echo "graph_vlabel memory used in bytes"
		echo "graph_category dns"
		p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
		p_config "mem.cache.message" "Message cache memory" "GAUGE"
		p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
		p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
		p_config "msg.cache.count" "msg cache count" "GAUGE"
		p_config "rrset.cache.count" "rrset cache count" "GAUGE"
		p_config "infra.cache.count" "infra cache count" "GAUGE"
		p_config "key.cache.count" "key cache count" "GAUGE"
		echo "graph_info The memory used by unbound."
		;;
	by_type)
		echo "graph_title Unbound DNS queries by type"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel queries / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		p_seen_config "num.query.type"
		echo "graph_info queries by DNS RR type queried for"
		;;
	by_class)
		echo "graph_title Unbound DNS queries by class"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel queries / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		p_seen_config "num.query.class"
		echo "graph_info queries by DNS RR class queried for."
		;;
	by_opcode)
		echo "graph_title Unbound DNS queries by opcode"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel queries / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		p_seen_config "num.query.opcode"
		echo "graph_info queries by opcode in the query packet."
		;;
	by_rcode)
		echo "graph_title Unbound DNS answers by return code"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel answer packets / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		p_seen_config "num.answer.rcode"
		p_config "num.answer.secure" "answer secure" "ABSOLUTE"
		p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
		p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
		echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
		;;
	by_flags)
		echo "graph_title Unbound DNS incoming queries by flags"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel queries / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
		p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
		p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
		p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
		p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
		p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
		p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
		p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
		p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
		p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
		echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
		;;
	histogram)
		echo "graph_title Unbound DNS histogram of reply time"
		echo "graph_args --base 1000 -l 0"
		echo "graph_vlabel queries / \${graph_period}"
		echo "graph_scale no"
		echo "graph_category dns"
		# cache hits form the base area, the time buckets are stacked on
		# top of it, coloured from blue (fast) to red (slow)
		p_hist_config hcache "cache hits" AREA 999999
		p_hist_config h64ms "0 msec - 66 msec" STACK 0000FF
		p_hist_config h128ms "66 msec - 131 msec" STACK 1F00DF
		p_hist_config h256ms "131 msec - 262 msec" STACK 3F00BF
		p_hist_config h512ms "262 msec - 524 msec" STACK 5F009F
		p_hist_config h1s "524 msec - 1 sec" STACK 7F007F
		p_hist_config h2s "1 sec - 2 sec" STACK 9F005F
		p_hist_config h4s "2 sec - 4 sec" STACK BF003F
		p_hist_config h8s "4 sec - 8 sec" STACK DF001F
		p_hist_config h16s "8 sec - ..." STACK FF0000
		echo "graph_info Histogram of the reply times for queries."
		;;
	esac

	exit 0
fi
424
# Fetching values from here on: refresh the state first.
get_state

# Read the elapsed time and refuse to report when it is zero or missing
# (get_value yields "0" for a statistic it cannot find).
get_value "time.elapsed"
case "$value" in
0|0.000000)
	echo "error: time elapsed 0 or could not retrieve data"
	exit 1
	;;
esac
elapsed="$value"
435
# Print "<field>.value <value>" for the statistic named $1, where <field>
# is the abbreviated name and <value> is what get_value finds for it.
print_value ( ) {
	get_value $1
	mn=$(echo $1 | sed $ABBREV | tr . _)
	echo "$mn.value" $value
}
442
# Print "<field>.value <value>" for the statistic named $1, taking the
# value from the already-read "name=value" line given as $2.
print_value_line ( ) {
	value=$(echo $2 | sed -e 's/^.*=//')
	mn=$(echo $1 | sed $ABBREV | tr . _)
	echo "$mn.value" $value
}
449
450
# Print the value of every counter listed in the seentags file whose name
# starts with $1.
print_seen_values ( ) {
	grep "^$1" "$seentags" | while IFS= read -r nm; do
		print_value "$nm"
	done
}

# Add up the values of all statistics named in the arguments; the total is
# left in the global $r.
sum_values ( ) {
	r=0
	for x in "$@"; do
		get_value "$x"
		r=$((r + value))
	done
}

# Print the values for the graph selected by $id.
case "$id" in
hits)
	# per-thread counters found in the state file, then the fixed set;
	# only counters actually present in the state file are printed
	for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
		sed -e 's/=.*//') total.num.queries \
		total.num.cachehits total.num.prefetch num.query.tcp \
		num.query.tcpout num.query.udpout num.query.tls num.query.tls.resume \
		num.query.ipv6 unwanted.queries \
		unwanted.replies; do
		if grep "^$x=" "$state" >/dev/null 2>&1; then
			print_value "$x"
		fi
	done
	;;
queue)
	for x in total.requestlist.avg total.requestlist.max \
		total.requestlist.overwritten total.requestlist.exceeded; do
		print_value "$x"
	done
	;;
memory)
	for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
		mem.mod.validator msg.cache.count rrset.cache.count \
		infra.cache.count key.cache.count; do
		print_value "$x"
	done
	;;
by_type)
	print_seen_values "num.query.type"
	;;
by_class)
	print_seen_values "num.query.class"
	;;
by_opcode)
	print_seen_values "num.query.opcode"
	;;
by_rcode)
	print_seen_values "num.answer.rcode"
	print_value "num.answer.secure"
	print_value "num.answer.bogus"
	print_value "num.rrset.bogus"
	;;
by_flags)
	for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC \
		num.query.flags.RD num.query.flags.RA num.query.flags.Z \
		num.query.flags.AD num.query.flags.CD num.query.edns.present \
		num.query.edns.DO; do
		print_value "$x"
	done
	;;
histogram)
	get_value total.num.cachehits
	echo "hcache.value $value"
	# every bucket below 65536 usec is folded into the first bar
	sum_values histogram.000000.000000.to.000000.000001 \
		histogram.000000.000001.to.000000.000002 \
		histogram.000000.000002.to.000000.000004 \
		histogram.000000.000004.to.000000.000008 \
		histogram.000000.000008.to.000000.000016 \
		histogram.000000.000016.to.000000.000032 \
		histogram.000000.000032.to.000000.000064 \
		histogram.000000.000064.to.000000.000128 \
		histogram.000000.000128.to.000000.000256 \
		histogram.000000.000256.to.000000.000512 \
		histogram.000000.000512.to.000000.001024 \
		histogram.000000.001024.to.000000.002048 \
		histogram.000000.002048.to.000000.004096 \
		histogram.000000.004096.to.000000.008192 \
		histogram.000000.008192.to.000000.016384 \
		histogram.000000.016384.to.000000.032768 \
		histogram.000000.032768.to.000000.065536
	echo "h64ms.value $r"
	get_value histogram.000000.065536.to.000000.131072
	echo "h128ms.value $value"
	get_value histogram.000000.131072.to.000000.262144
	echo "h256ms.value $value"
	get_value histogram.000000.262144.to.000000.524288
	echo "h512ms.value $value"
	get_value histogram.000000.524288.to.000001.000000
	echo "h1s.value $value"
	get_value histogram.000001.000000.to.000002.000000
	echo "h2s.value $value"
	get_value histogram.000002.000000.to.000004.000000
	echo "h4s.value $value"
	get_value histogram.000004.000000.to.000008.000000
	echo "h8s.value $value"
	# every bucket from 8 seconds upwards is folded into the last bar
	sum_values histogram.000008.000000.to.000016.000000 \
		histogram.000016.000000.to.000032.000000 \
		histogram.000032.000000.to.000064.000000 \
		histogram.000064.000000.to.000128.000000 \
		histogram.000128.000000.to.000256.000000 \
		histogram.000256.000000.to.000512.000000 \
		histogram.000512.000000.to.001024.000000 \
		histogram.001024.000000.to.002048.000000 \
		histogram.002048.000000.to.004096.000000 \
		histogram.004096.000000.to.008192.000000 \
		histogram.008192.000000.to.016384.000000 \
		histogram.016384.000000.to.032768.000000 \
		histogram.032768.000000.to.065536.000000 \
		histogram.065536.000000.to.131072.000000 \
		histogram.131072.000000.to.262144.000000 \
		histogram.262144.000000.to.524288.000000
	echo "h16s.value $r"
	;;
esac
567