#!/bin/bash
#
# Copyright (c) 2008 Voltaire, Inc. All rights reserved.
# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.
#
#
30219820Sjeff
# OpenSM was found to have the following problem
# when a handover is performed:
# If some of the cluster nodes are rebooted during the handover, they lose their LID assignment.
# The reason is that the standby SM does not obey its own GUID-to-LID table
# and simply uses the discovered LIDs. If some nodes are not available to it,
# their previous LID assignment is lost forever.
37219820Sjeff
# The idea is to use an external daemon that will distribute
# the semi-static LID assignment table from the master SM to all standby SMs.
# A standby SM that becomes the master needs to obey the copied semi-static LID assignment table.
41219820Sjeff
# Installation prefixes.
# NOTE(review): the @...@ tokens look like build-time (autoconf-style)
# placeholders - confirm against the build system.
prefix=@prefix@
exec_prefix=@exec_prefix@

# Optional site configuration, sourced into this shell when present.
# Every setting read below (SLDD_DEBUG, CACHE_FILE, RCP, RSH, IFCONFIG,
# RESCAN_TIME, OSM_HOSTS) keeps a value that is already set at this point.
CONFIG=@sysconfdir@/sysconfig/opensm
if [ -f "$CONFIG" ]; then
	. "$CONFIG"
fi

# Debug switch: 1 enables the progress messages printed with echo.
# Anything that is not a plain non-negative number is treated as 0, so the
# numeric "-eq" tests on this variable never fail with a syntax error.
SLDD_DEBUG=${SLDD_DEBUG:-0}
case "${SLDD_DEBUG}" in
	*[!0-9]*) SLDD_DEBUG=0 ;;
esac
declare -i SLDD_DEBUG

# Local cache file, the directory that holds it, and a companion ".tmp" name.
CACHE_FILE=${CACHE_FILE:-/var/cache/opensm/guid2lid}
CACHE_DIR=$(dirname -- "${CACHE_FILE}")
tmp_cache=${CACHE_FILE}.tmp

# Probe command: one echo request, at most one second.
PING='ping -w 1 -c 1'

# Remote copy / remote shell commands and the interface listing command.
RCP=${RCP:-/usr/bin/scp}
RSH=${RSH:-/usr/bin/ssh}
IFCONFIG=${IFCONFIG:-'/sbin/ifconfig -a'}

# Seconds to sleep between two passes of the main loop.
RESCAN_TIME=${RESCAN_TIME:-60}

# Split OSM_HOSTS on whitespace (deliberately unquoted) and count the entries.
declare -a arr_OSM_HOSTS
arr_OSM_HOSTS=(${OSM_HOSTS})

num_of_osm_hosts=${#arr_OSM_HOSTS[@]}

# Nothing to do without any host; this also covers a whitespace-only OSM_HOSTS.
if [ "${num_of_osm_hosts}" -eq 0 ]; then
	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "No OpenSM servers (OSM_HOSTS) configured for the IB subnet."
	exit 0
fi

# A single host has no peer to exchange the cache with.
if [ "${num_of_osm_hosts}" -eq 1 ]; then
	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "One OpenSM server configured in the IB subnet." &&
	echo "Nothing to be done for SLDD"

	exit 0
fi
84219820Sjeff
# trap_handler
# Write a farewell line to the system log and leave with exit status 0.
trap_handler()
{
	logger -i "SLDD: Exiting."
	exit 0
}

# Run the handler on SIGTERM (signal 15), referred to by name.
trap 'trap_handler' TERM
92219820Sjeff
# is_alive HOST
# Probe HOST with the command held in ${PING}; all probe output is discarded.
# Returns the exit status of the probe command.
is_alive()
{
	# ${PING} stays unquoted on purpose: it holds the command and its options.
	# The host is quoted so it always reaches the probe as a single argument.
	$PING "$1" > /dev/null 2>&1
}
98219820Sjeff
# is_local HOST
# Succeeds when HOST appears as a whole word in the output of ${IFCONFIG}.
# HOST is matched as a literal string, not as a regular expression, so the
# dots of an address only match dots. An empty HOST never matches.
# Returns 0 on a match, non-zero otherwise.
is_local()
{
	[ -n "$1" ] || return 1

	# ${IFCONFIG} stays unquoted on purpose: it holds the command and its options.
	$IFCONFIG | grep -F -w -e "$1" > /dev/null 2>&1
}
104219820Sjeff
# update_remote_cache
# Push a snapshot of the local cache file to every other host in OSM_HOSTS.
#
# Globals read: CACHE_FILE, CACHE_DIR, OSM_HOSTS, local_host, RSH, RCP,
#               SLDD_DEBUG
# For each host other than ${local_host} that answers is_alive:
#   1. ${RSH} creates ${CACHE_DIR} on the host and removes
#      ${CACHE_FILE}.${local_host} there;
#   2. ${RCP} copies the snapshot to <host>:${CACHE_FILE}.${local_host};
#   3. the snapshot is also stored locally as ${CACHE_FILE}.<host>.
# Hosts that do not answer are skipped.
# Exits the whole script with status 5 when step 1 fails.
# A failure of step 2 is reported through logger.
update_remote_cache()
{
	local host stat
	local snapshot="${CACHE_FILE}.upd"

	# All copies below are made from this snapshot of the cache file.
	/bin/rm -f "${snapshot}"
	/bin/cp -a "${CACHE_FILE}" "${snapshot}"

	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "Updating remote cache file"

	# OSM_HOSTS is split on whitespace on purpose.
	for host in ${OSM_HOSTS}
	do
		# Skip local host update
		if [ "${host}" == "${local_host}" ]; then
			continue
		fi

		if ! is_alive "${host}"; then
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "$host is down."
			continue
		fi

		# The remote command prints the status of its last step (the rm).
		stat=$($RSH "${host}" "/bin/mkdir -p ${CACHE_DIR} > /dev/null 2>&1; /bin/rm -f ${CACHE_FILE}.${local_host} > /dev/null 2>&1; echo \$?" | tr -d '[:space:]')
		if [ "X${stat}" != "X0" ]; then
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "$RSH to $host failed."
			logger -i "SLDD: Failed to update $host with ${CACHE_FILE}. $RSH without password should be enabled"
			exit 5
		fi

		[ "${SLDD_DEBUG}" -eq 1 ] &&
		echo "Updating $host"
		logger -i "SLDD: updating $host with ${CACHE_FILE}"
		if ! $RCP "${snapshot}" "${host}:${CACHE_FILE}.${local_host}"; then
			# Report the failed copy instead of ignoring it silently.
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "$RCP to $host failed."
			logger -i "SLDD: Failed to copy ${CACHE_FILE} to $host using $RCP"
		fi

		# NOTE(review): the local marker is written even after a failed copy,
		# as before. The main loop re-runs this function without sleeping for
		# as long as no marker is at least as large as the local cache, so
		# dropping the marker here would make it spin on a persistent error.
		/bin/cp "${snapshot}" "${CACHE_FILE}.${host}"
	done
}
141219820Sjeff
# get_latest_remote_cache
# Print, without a trailing newline, the most recently modified file whose
# name is ${CACHE_FILE} followed by a suffix containing at least two dots
# (the suffix should look like an IP address: *.*.*.*).
# Prints nothing when no such file exists.
get_latest_remote_cache()
{
	# Only the directory part is quoted; the trailing .*.* must stay a glob.
	# An unmatched glob is passed literally to ls, whose error is discarded.
	printf '%s' "$(/bin/ls -1t -- "${CACHE_FILE}".*.* 2> /dev/null | head -n 1)"
}
147219820Sjeff
# get_largest_remote_cache
# Print, without a trailing newline, the largest (by size) file whose name is
# ${CACHE_FILE} followed by a suffix containing at least two dots
# (the suffix should look like an IP address: *.*.*.*).
# Prints nothing when no such file exists.
get_largest_remote_cache()
{
	# Only the directory part is quoted; the trailing .*.* must stay a glob.
	# An unmatched glob is passed literally to ls, whose error is discarded.
	printf '%s' "$(/bin/ls -1S -- "${CACHE_FILE}".*.* 2> /dev/null | head -n 1)"
}
153219820Sjeff
# swap_cache_files
# Replace the local cache file with the file named by ${largest_remote_cache}.
#
# Globals read: CACHE_FILE, largest_remote_cache
# On success the previous cache is kept as ${CACHE_FILE}.old and
# ${CACHE_FILE}.tmp is touched after the copy, so it carries a time stamp
# that is not older than the new cache.
# Returns 0 on success. Returns 1, with the local cache left as it was, when
# the source is unset or unreadable or when the copy fails.
swap_cache_files()
{
	local src="${largest_remote_cache}"

	# Do not give up the current cache unless there is a replacement for it.
	if [ -z "${src}" ] || [ ! -r "${src}" ]; then
		return 1
	fi

	/bin/rm -f "${CACHE_FILE}.old"
	if [ -e "${CACHE_FILE}" ]; then
		/bin/mv "${CACHE_FILE}" "${CACHE_FILE}.old"
	fi

	if ! /bin/cp "${src}" "${CACHE_FILE}"; then
		# Copy failed: put the previous cache back in place.
		if [ -e "${CACHE_FILE}.old" ]; then
			/bin/mv -f "${CACHE_FILE}.old" "${CACHE_FILE}"
		fi
		return 1
	fi

	touch "${CACHE_FILE}.tmp"
}
161219820Sjeff
# Find local host in the osm hosts list.
# Every entry is tested; when several entries are local the last one wins.
local_host=""
for host in ${OSM_HOSTS}
do
	if is_local "${host}"; then
		local_host=${host}
	fi
done

# Without a local host no entry is skipped when the cache is pushed, and the
# pushed copies are named "${CACHE_FILE}." - a name the "${CACHE_FILE}.*.*"
# lookups do not match. Report it instead of running silently like that.
if [ -z "${local_host}" ]; then
	logger -i "SLDD: none of the OSM_HOSTS entries (${OSM_HOSTS}) matches a local interface address"
fi

# Get cache file info
declare -i new_size=0
declare -i last_size=0
declare -i largest_remote_cache_size=0

if [ -e "${CACHE_FILE}" ]; then
	# Size in bytes of the cache file as found at start-up.
	last_size=$(du -b "${CACHE_FILE}" | awk '{print $1}')
else
	# First run: make sure the directory exists before creating the empty
	# cache file and its ".tmp" companion.
	/bin/mkdir -p "${CACHE_DIR}"
	touch "${CACHE_FILE}" "${CACHE_FILE}.tmp"
fi

# Disabled: push an already populated cache to the other hosts at start-up.
# if [ ${last_size} -gt 0 ]; then
# 	# First time update
# 	update_remote_cache
# fi
186219820Sjeff
# Main loop. Each pass compares the local cache file with the largest file
# matching ${CACHE_FILE}.*.* and then either pushes the local cache to the
# other hosts or replaces the local cache with that larger file.
while true
do
	if [ -s "${CACHE_FILE}" ]; then
		new_size=$(du -b "${CACHE_FILE}" | awk '{print $1}')

		# Largest peer copy and its size (0 when there is none or it is empty).
		largest_remote_cache=$(get_largest_remote_cache)
		if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
			largest_remote_cache_size=$(du -b "${largest_remote_cache}" 2> /dev/null | awk '{print $1}')
		else
			largest_remote_cache_size=0
		fi

		# Check if local cache file grew from its last version or the time stamp changed
		# (the cache file is newer than its ".tmp" companion).
		# The two tests are joined with "||": without it the shell runs them
		# as two separate commands and only the second one decides.
		if [ "${new_size}" -gt "${last_size}" ] ||
		   [ "$(/bin/ls -1t -- "${CACHE_FILE}" "${CACHE_FILE}.tmp" 2> /dev/null | head -n 1)" != "${CACHE_FILE}.tmp" ]; then
			# Check if local cache file is larger than the remote cache file
			if [ "${new_size}" -gt "${largest_remote_cache_size}" ]; then
				[ "${SLDD_DEBUG}" -eq 1 ] &&
				echo "Local cache file larger then remote. Update remote cache files"
				last_size=${new_size}
				update_remote_cache
				# NOTE(review): this skips the sleep at the end of the loop,
				# so the next pass starts immediately.
				continue
			fi
		fi

		# Update local cache file from remote
		if [ "${largest_remote_cache_size}" -gt "${new_size}" ]; then
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "Local cache file shorter then remote. Use ${largest_remote_cache}"
			logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
			# Remember the new size only when the swap actually happened.
			if swap_cache_files; then
				last_size=${largest_remote_cache_size}
			fi
		fi

	else # The local cache file is empty
		[ "${SLDD_DEBUG}" -eq 1 ] &&
		echo "${CACHE_FILE} is empty"

		largest_remote_cache=$(get_largest_remote_cache)
		if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
			# Copy it to the current cache
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "Local cache file is empty. Use ${largest_remote_cache}"
			logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
			# Keep last_size in step with the new content, as the branch above does.
			if swap_cache_files; then
				last_size=$(du -b "${CACHE_FILE}" 2> /dev/null | awk '{print $1}')
			fi
		fi

	fi

	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "Sleeping ${RESCAN_TIME} seconds."
	sleep "${RESCAN_TIME}"

done
247