1#!/bin/bash
2#
3# Copyright (c) 2008 Voltaire, Inc. All rights reserved.
4# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
5#
6# This Software is licensed under one of the following licenses:
7#
8# 1) under the terms of the "Common Public License 1.0" a copy of which is
9#    available from the Open Source Initiative, see
10#    http://www.opensource.org/licenses/cpl.php.
11#
12# 2) under the terms of the "The BSD License" a copy of which is
13#    available from the Open Source Initiative, see
14#    http://www.opensource.org/licenses/bsd-license.php.
15#
16# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
17#    copy of which is available from the Open Source Initiative, see
18#    http://www.opensource.org/licenses/gpl-license.php.
19#
20# Licensee has the right to choose one of the above licenses.
21#
22# Redistributions of source code must retain the above copyright
23# notice and one of the license notices.
24#
25# Redistributions in binary form must reproduce both the above copyright
26# notice, one of the license notices in the documentation
27# and/or other materials provided with the distribution.
28#
29#
30
# OpenSM was found to have the following problem
# when handover is performed:
# If some of the cluster nodes are rebooted during the handover they lose their LID assignment.
# The reason is that the standby SM does not obey its own GUID-to-LID table
# and simply uses the discovered LIDs. If some nodes are not available to it,
# their previous LID assignment is lost forever.
37
# The idea is to use an external daemon that will distribute
# the semi-static LID assignment table from the master SM to all standby SMs.
# A standby SM, on becoming a master, needs to obey the copied semi-static LID assignment table.
41
# Installation prefixes. The @...@ tokens look like build-time placeholders
# (presumably substituted by configure -- confirm against the build system).
prefix=@prefix@
exec_prefix=@exec_prefix@

# Optional site configuration; when present it is sourced so it can preset
# any of the variables defaulted below (and OSM_HOSTS).
CONFIG=@sysconfdir@/sysconfig/opensm
test -f $CONFIG && source $CONFIG

# Debug output is off unless SLDD_DEBUG was already set; the value is
# assigned first and only then given the integer attribute.
: "${SLDD_DEBUG:=0}"
declare -i SLDD_DEBUG

# Local GUID-to-LID cache file, its directory, and the companion ".tmp" path.
: "${CACHE_FILE:=/var/cache/opensm/guid2lid}"
CACHE_DIR=$(dirname ${CACHE_FILE})
tmp_cache=${CACHE_FILE}.tmp

# Reachability probe: a single ping with a one second deadline.
PING='ping -w 1 -c 1'

# Remote copy / remote shell / interface listing commands (overridable).
: "${RCP:=/usr/bin/scp}"
: "${RSH:=/usr/bin/ssh}"
: "${IFCONFIG:=/sbin/ifconfig -a}"

# Seconds to sleep between two scans of the cache files.
: "${RESCAN_TIME:=60}"
64
# Without any configured OpenSM host there is nothing to synchronise:
# report it in debug mode and leave successfully.
if [ -z "${OSM_HOSTS}" ]; then
	if [ $SLDD_DEBUG -eq 1 ]; then
		echo "No OpenSM servers (OSM_HOSTS) configured for the IB subnet."
	fi
	exit 0
fi


# Word-split the host list into an array so its entries can be counted.
declare -a arr_OSM_HOSTS
arr_OSM_HOSTS=(${OSM_HOSTS})
num_of_osm_hosts=${#arr_OSM_HOSTS[@]}

# A single OpenSM host has no peer to exchange the cache with.
if (( num_of_osm_hosts == 1 )); then
	if [ $SLDD_DEBUG -eq 1 ]; then
		echo "One OpenSM server configured in the IB subnet." &&
		echo "Nothing to be done for SLDD"
	fi

	exit 0
fi
84
# SIGTERM handler: note the shutdown through logger and exit with status 0.
trap_handler()
{
	logger -i "SLDD: Exiting."
	exit 0
}

# Run the handler when the daemon receives SIGTERM (signal 15).
trap trap_handler TERM
92
# Probe a host with the command held in $PING.
# Arguments: $1 - host name or address to probe
# Outputs:   nothing (the probe's stdout and stderr are discarded)
# Returns:   the exit status of the probe command
is_alive()
{
	$PING $1 &> /dev/null
}
98
# Tell whether a host entry belongs to this machine by looking for it in the
# output of the $IFCONFIG command.
# Arguments: $1 - host entry (address) to look for
# Outputs:   nothing
# Returns:   0 if $1 occurs as a whole word in the $IFCONFIG output,
#            non-zero otherwise
is_local()
{
	# -F: match $1 literally, so the dots of an address are not regex
	#     wildcards (otherwise 10.0.0.1 would also match e.g. 10x0y0z1).
	# -w: whole words only, so 10.0.0.1 does not match 10.0.0.11.
	# --: a value starting with '-' is never taken for a grep option.
	$IFCONFIG | grep -qwF -- "$1" 2> /dev/null
}
104
# Push the local cache file to every other reachable host in OSM_HOSTS.
#
# A snapshot ${CACHE_FILE}.upd is taken first. For each remote host that
# answers the is_alive probe, the remote cache directory is created and the
# old remote copy removed through $RSH; then the snapshot is sent with $RCP as
# ${CACHE_FILE}.${local_host} and also stored locally as ${CACHE_FILE}.${host}.
#
# Globals:   CACHE_FILE, CACHE_DIR, OSM_HOSTS, local_host, RSH, RCP,
#            SLDD_DEBUG (all read)
# Arguments: none
# Outputs:   debug messages on stdout when SLDD_DEBUG is 1; logger entries
# Returns:   does not return (exit 5) when the remote preparation through
#            $RSH does not report status 0 for a live host
update_remote_cache()
{
	# Keep the loop variable and the scratch status out of the global scope;
	# the top-level script uses a global named "host" as well.
	local host stat

	/bin/rm -f "${CACHE_FILE}.upd"
	/bin/cp -a "${CACHE_FILE}" "${CACHE_FILE}.upd"

	[ $SLDD_DEBUG -eq 1 ] &&
	echo "Updating remote cache file"

	# OSM_HOSTS is a whitespace-separated list: word splitting is intended.
	for host in ${OSM_HOSTS}
	do
		# Skip local host update
		if [ "${host}" == "${local_host}" ]; then
			continue
		fi

		if ! is_alive "${host}"; then
			[ $SLDD_DEBUG -eq 1 ] &&
			echo "$host is down."
			continue
		fi

		# The remote command prints the status of its last step (the rm);
		# anything other than "0", including no output at all because $RSH
		# itself failed, is treated as a failure.
		stat=$($RSH "${host}" "/bin/mkdir -p ${CACHE_DIR} > /dev/null 2>&1; /bin/rm -f ${CACHE_FILE}.${local_host} > /dev/null 2>&1; echo \$?" | tr -d '[:space:]')
		if [ "X${stat}" == "X0" ]; then
			[ $SLDD_DEBUG -eq 1 ] &&
			echo "Updating $host"
			logger -i "SLDD: updating $host with ${CACHE_FILE}"
			# NOTE(review): the status of $RCP is not checked, so the local
			# bookkeeping copy below is written even if the transfer failed.
			$RCP "${CACHE_FILE}.upd" "${host}:${CACHE_FILE}.${local_host}"
			/bin/cp "${CACHE_FILE}.upd" "${CACHE_FILE}.${host}"
		else
			[ $SLDD_DEBUG -eq 1 ] &&
			echo "$RSH to $host failed."
			logger -i "SLDD: Failed to update $host with ${CACHE_FILE}. $RSH without password should be enabled"
			exit 5
		fi
	done
}
141
# Print the most recently modified remote cache copy.
# Candidates are the files matching ${CACHE_FILE}.*.* (the suffix is expected
# to look like an IP address); ls -t orders them newest first.
# Globals:   CACHE_FILE (read)
# Outputs:   the selected path on stdout without a trailing newline, or
#            nothing when no candidate exists
get_latest_remote_cache()
{
	local newest

	newest=$(/bin/ls -1t ${CACHE_FILE}.*.* 2> /dev/null | head -n 1)
	echo -n "${newest}"
}
147
# Print the biggest remote cache copy.
# Candidates are the files matching ${CACHE_FILE}.*.* (the suffix is expected
# to look like an IP address); ls -S orders them largest first.
# Globals:   CACHE_FILE (read)
# Outputs:   the selected path on stdout without a trailing newline, or
#            nothing when no candidate exists
get_largest_remote_cache()
{
	local biggest

	biggest=$(/bin/ls -1S ${CACHE_FILE}.*.* 2> /dev/null | head -n 1)
	echo -n "${biggest}"
}
153
# Replace the local cache with the remote copy named by largest_remote_cache.
# The current local cache is kept as ${CACHE_FILE}.old (overwriting any
# previous backup) and the ${CACHE_FILE}.tmp marker is touched afterwards.
# Globals:   CACHE_FILE, largest_remote_cache (read)
# Arguments: none
# Returns:   the exit status of the final touch
swap_cache_files()
{
	local backup=${CACHE_FILE}.old

	/bin/rm -f ${backup}
	/bin/mv ${CACHE_FILE} ${backup}
	/bin/cp ${largest_remote_cache} ${CACHE_FILE}
	# Refresh the marker so it is not older than the cache just installed.
	touch ${CACHE_FILE}.tmp
}
161
# Work out which entry of OSM_HOSTS is this machine. Every entry is checked
# and there is no early exit, so when several entries are local the last one
# is kept; local_host stays empty when none is.
local_host=""
for host in ${OSM_HOSTS}
do
	is_local $host && local_host=${host}
done
170
# Size bookkeeping for the main loop, all held as integers:
#   new_size                  - size of the local cache in the current pass
#   last_size                 - size of the local cache when last handled
#   largest_remote_cache_size - size of the biggest remote copy
declare -i new_size=0 last_size=0 largest_remote_cache_size=0

# Start from the size (in bytes, via du -b) of an existing cache file;
# otherwise create an empty cache together with its ".tmp" marker.
if [ ! -e ${CACHE_FILE} ]; then
	touch ${CACHE_FILE} ${CACHE_FILE}.tmp
else
	last_size=$(du -b ${CACHE_FILE} | awk '{print$1}' | tr -d '[:space:]')
fi
181
182# if [ ${last_size} -gt 0 ]; then
183# 	# First time update
184# 	update_remote_cache
185# fi
186
# Main daemon loop. Every pass compares the local cache file with the copies
# received from the other OpenSM hosts (${CACHE_FILE}.<suffix>.<...> files):
#   - local cache changed and is larger than every remote copy
#       -> push it to the other hosts (update_remote_cache);
#   - a remote copy is larger than the local cache, or the local cache is
#     empty while a non-empty remote copy exists
#       -> install that copy locally (swap_cache_files).
# The loop then sleeps RESCAN_TIME seconds before the next pass.
while true
do
	if [ -s "${CACHE_FILE}" ]; then
		new_size=$(du -b "${CACHE_FILE}" | awk '{print$1}' | tr -d '[:space:]')

		# Size of the largest non-empty remote copy (0 when there is none).
		# Computed once per pass; both comparisons below use it.
		largest_remote_cache=$(get_largest_remote_cache)
		if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
			largest_remote_cache_size=$(du -b "${largest_remote_cache}" 2> /dev/null | awk '{print$1}' | tr -d '[:space:]')
		else
			largest_remote_cache_size=0
		fi

		# Check if local cache file grew from its last version or the time stamp changed.
		# The two tests must be joined with ||: listed one after the other
		# without an operator, only the status of the last test would decide
		# the branch and the size comparison would have no effect.
		# The time stamp test is true unless the .tmp marker sorts first in
		# "ls -t", i.e. unless the marker is the most recently modified.
		if [ ${new_size} -gt ${last_size} ] ||
		   [ "$(/bin/ls -1t "${CACHE_FILE}" "${CACHE_FILE}.tmp" 2> /dev/null | head -1)"  != "${CACHE_FILE}.tmp" ]; then
			# Check if local cache file larger than remote cache file
			if [ ${new_size} -gt ${largest_remote_cache_size} ]; then
				[ $SLDD_DEBUG -eq 1 ] &&
				echo "Local cache file larger then remote. Update remote cache files"
				last_size=${new_size}
				update_remote_cache
				# NOTE(review): this restarts the loop without sleeping, so
				# the next pass runs immediately -- confirm this is intended.
				continue
			fi
		fi

		# Update local cache file from remote
		if [ ${largest_remote_cache_size} -gt ${new_size} ]; then
			[ $SLDD_DEBUG -eq 1 ] &&
			echo "Local cache file shorter then remote. Use ${largest_remote_cache}"
			logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
			swap_cache_files
			last_size=${largest_remote_cache_size}
		fi

	else # The local cache file is empty
		[ $SLDD_DEBUG -eq 1 ] &&
		echo "${CACHE_FILE} is empty"

		largest_remote_cache=$(get_largest_remote_cache)
		if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
			# Copy it to the current cache
			[ $SLDD_DEBUG -eq 1 ] &&
			echo "Local cache file is empty. Use ${largest_remote_cache}"
			logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
			swap_cache_files
		fi

	fi

	[ $SLDD_DEBUG -eq 1 ] &&
	echo "Sleeping ${RESCAN_TIME} seconds."
	sleep ${RESCAN_TIME}

done
247