#!/bin/bash
#
# Copyright (c) 2008 Voltaire, Inc. All rights reserved.
# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.
#
#

# OpenSM found to have the following problem
# when handover is performed:
# If some of the cluster nodes are rebooted during the handover they lose
# their LID assignment.
# The reason for it is that the standby SM does not obey its own Guid to LID
# table and simply uses the discovered LIDs. If some nodes are not available
# for it their previous LID assignment is lost forever.

# The idea is to use an external daemon that will distribute
# the semi-static LID assignment table from the master SM to all standby SMs.
# A standby SM, becoming a master, needs to obey the copied semi-static LID
# assignment table.
#
# Settings read from the sourced config file or the environment:
#   OSM_HOSTS    whitespace-separated list of OpenSM hosts (required)
#   SLDD_DEBUG   1 enables progress messages on stdout (default 0)
#   CACHE_FILE   local guid2lid cache (default /var/cache/opensm/guid2lid)
#   RESCAN_TIME  seconds to sleep between scans (default 60)
#   RCP, RSH, IFCONFIG  remote copy / remote shell / interface listing commands

# NOTE(review): the @...@ tokens look like build-time (autoconf-style)
# substitutions; prefix/exec_prefix are kept because the substituted
# sysconfdir value may refer to them.
prefix=@prefix@
exec_prefix=@exec_prefix@

CONFIG=@sysconfdir@/sysconfig/opensm
if [ -f "$CONFIG" ]; then
    . "$CONFIG"
fi

SLDD_DEBUG=${SLDD_DEBUG:-0}

CACHE_FILE=${CACHE_FILE:-/var/cache/opensm/guid2lid}
CACHE_DIR=$(dirname "${CACHE_FILE}")
# Marker file: touched whenever the local cache is replaced from a remote
# copy, so its mtime can be compared against the cache file's mtime.
tmp_cache=${CACHE_FILE}.tmp

# The command variables below are expanded unquoted on purpose: they may
# carry options (e.g. IFCONFIG='/sbin/ifconfig -a') and rely on word splitting.
PING='ping -w 1 -c 1'

RCP=${RCP:-/usr/bin/scp}
RSH=${RSH:-/usr/bin/ssh}
IFCONFIG=${IFCONFIG:-'/sbin/ifconfig -a'}

declare -i SLDD_DEBUG
RESCAN_TIME=${RESCAN_TIME:-60}

# Print the arguments as one line on stdout when SLDD_DEBUG is 1.
debug()
{
    if [ "${SLDD_DEBUG}" -eq 1 ]; then
        printf '%s\n' "$*"
    fi
}

if [ -z "${OSM_HOSTS}" ]; then
    debug "No OpenSM servers (OSM_HOSTS) configured for the IB subnet."
    exit 0
fi

declare -a arr_OSM_HOSTS
arr_OSM_HOSTS=(${OSM_HOSTS})

num_of_osm_hosts=${#arr_OSM_HOSTS[@]}

if [ "${num_of_osm_hosts}" -eq 1 ]; then
    debug "One OpenSM server configured in the IB subnet."
    debug "Nothing to be done for SLDD"
    exit 0
fi

# SIGTERM handler: log and leave with status 0.
trap_handler()
{
    logger -i "SLDD: Exiting."
    exit 0
}

trap 'trap_handler' 15

# Succeeds when host $1 answers a single ping within one second.
is_alive()
{
    $PING "$1" > /dev/null 2>&1
}

# Succeeds when $1 appears as a whole word in the $IFCONFIG output.
# -F matches the host literally, so the dots of an address are not treated
# as regex wildcards; -- protects against a value starting with '-'.
is_local()
{
    $IFCONFIG | grep -w -F -- "$1" > /dev/null 2>&1
}

# Print the size in bytes of file $1 as reported by `du -b`
# (prints nothing when du fails, e.g. the file is missing).
file_size()
{
    du -b "$1" 2> /dev/null | awk '{print $1}' | tr -d '[:space:]'
}

# Push the local cache file to every other reachable host in OSM_HOSTS.
# Globals: reads CACHE_FILE, CACHE_DIR, OSM_HOSTS, local_host, RSH, RCP.
# Exits the whole script with status 5 when the remote shell command to a
# pingable host does not report success.
update_remote_cache()
{
    local host remote_status

    # Work from a snapshot so that every host receives the same content.
    /bin/rm -f "${CACHE_FILE}.upd"
    /bin/cp -a "${CACHE_FILE}" "${CACHE_FILE}.upd"

    debug "Updating remote cache file"

    for host in ${OSM_HOSTS}
    do
        # Skip local host update
        if [ "${host}" == "${local_host}" ]; then
            continue
        fi

        if ! is_alive "${host}"; then
            debug "$host is down."
            continue
        fi

        # Prepare the remote side; the remote "echo $?" is how the status
        # comes back, so an empty answer means the remote shell itself failed.
        remote_status=$($RSH "${host}" "/bin/mkdir -p ${CACHE_DIR} > /dev/null 2>&1; /bin/rm -f ${CACHE_FILE}.${local_host} > /dev/null 2>&1; echo \$?" | tr -d '[:space:]')
        if [ "X${remote_status}" != "X0" ]; then
            debug "$RSH to $host failed."
            logger -i "SLDD: Failed to update $host with ${CACHE_FILE}. $RSH without password should be enabled"
            exit 5
        fi

        debug "Updating $host"
        logger -i "SLDD: updating $host with ${CACHE_FILE}"
        $RCP "${CACHE_FILE}.upd" "${host}:${CACHE_FILE}.${local_host}"
        # Also store the pushed content locally under the peer's name; the
        # remote-cache lookups match it when the host is a dotted address.
        /bin/cp "${CACHE_FILE}.upd" "${CACHE_FILE}.${host}"
    done
}

# Print the most recently modified remote cache file
# (the suffix should be like ip address: *.*.*.*); prints nothing if none.
get_latest_remote_cache()
{
    printf '%s' "$(/bin/ls -1t "${CACHE_FILE}".*.* 2> /dev/null | head -1)"
}

# Print the largest (by size) remote cache file
# (the suffix should be like ip address: *.*.*.*); prints nothing if none.
get_largest_remote_cache()
{
    printf '%s' "$(/bin/ls -1S "${CACHE_FILE}".*.* 2> /dev/null | head -1)"
}

# Set largest_remote_cache to the largest remote cache file and
# largest_remote_cache_size to its size (0 when none exists or it is empty).
find_largest_remote_cache()
{
    largest_remote_cache=$(get_largest_remote_cache)
    if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
        largest_remote_cache_size=$(file_size "${largest_remote_cache}")
    else
        largest_remote_cache_size=0
    fi
}

# Replace the local cache with ${largest_remote_cache}, keeping the previous
# content as ${CACHE_FILE}.old, then touch the marker file so it is newer
# than the freshly copied cache.
swap_cache_files()
{
    /bin/rm -f "${CACHE_FILE}.old"
    /bin/mv "${CACHE_FILE}" "${CACHE_FILE}.old"
    /bin/cp "${largest_remote_cache}" "${CACHE_FILE}"
    touch "${tmp_cache}"
}

# Find local host in the osm hosts list (the last matching entry wins)
local_host=""
for host in ${OSM_HOSTS}
do
    if is_local "${host}"; then
        local_host=${host}
    fi
done

# Get cache file info
declare -i new_size=0
declare -i last_size=0
declare -i largest_remote_cache_size=0

if [ -e "${CACHE_FILE}" ]; then
    last_size=$(file_size "${CACHE_FILE}")
else
    touch "${CACHE_FILE}" "${tmp_cache}"
fi

while true
do
    if [ -s "${CACHE_FILE}" ]; then
        new_size=$(file_size "${CACHE_FILE}")

        # Check if local cache file grew from its last version or the time
        # stamp changed (i.e. the marker file is not the newer of the two).
        # The two tests must be joined with ||: as separate commands only the
        # last one would decide the branch.
        if [ "${new_size}" -gt "${last_size}" ] ||
           [ "$(/bin/ls -1t "${CACHE_FILE}" "${tmp_cache}" 2> /dev/null | head -1)" != "${tmp_cache}" ]; then
            find_largest_remote_cache

            # Check if local cache file larger than remote cache file
            if [ "${new_size}" -gt "${largest_remote_cache_size}" ]; then
                debug "Local cache file larger then remote. Update remote cache files"
                last_size=${new_size}
                update_remote_cache
                # Re-evaluate right away; note this skips the sleep below.
                continue
            fi
        fi

        find_largest_remote_cache

        # Update local cache file from remote
        if [ "${largest_remote_cache_size}" -gt "${new_size}" ]; then
            debug "Local cache file shorter then remote. Use ${largest_remote_cache}"
            logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
            swap_cache_files
            last_size=${largest_remote_cache_size}
        fi

    else # The local cache file is empty
        debug "${CACHE_FILE} is empty"

        largest_remote_cache=$(get_largest_remote_cache)
        if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
            # Copy it to the current cache
            debug "Local cache file is empty. Use ${largest_remote_cache}"
            logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
            swap_cache_files
            # Remember the adopted size so the "grew" test above compares
            # against the content just installed.
            last_size=$(file_size "${CACHE_FILE}")
        fi

    fi

    debug "Sleeping ${RESCAN_TIME} seconds."
    sleep "${RESCAN_TIME}"

done