#!/bin/bash
#
# Copyright (c) 2008 Voltaire, Inc. All rights reserved.
# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.
#
#

# OpenSM was found to have the following problem when handover is performed:
# if some of the cluster nodes are rebooted during the handover they lose
# their LID assignment. The reason is that the standby SM does not obey its
# own GUID-to-LID table and simply uses the discovered LIDs. If some nodes are
# not available to it, their previous LID assignment is lost forever.

# The idea is to use an external daemon that distributes the semi-static LID
# assignment table from the master SM to all standby SMs. A standby SM, on
# becoming master, needs to obey the copied semi-static LID assignment table.
#
# Configuration (read from ${CONFIG} if that file exists, else environment):
#   OSM_HOSTS    whitespace-separated addresses of all OpenSM servers
#   SLDD_DEBUG   1 to print progress messages on stdout (default 0)
#   CACHE_FILE   local guid2lid cache file
#   RESCAN_TIME  seconds to sleep between polls (default 60)
#   RCP, RSH     remote copy / remote shell commands
#   IFCONFIG     command that lists the local interface addresses

prefix=@prefix@
exec_prefix=@exec_prefix@

CONFIG=@sysconfdir@/sysconfig/opensm
if [ -f "${CONFIG}" ]; then
    # shellcheck disable=SC1090 -- path is only known after configure
    . "${CONFIG}"
fi

SLDD_DEBUG=${SLDD_DEBUG:-0}

CACHE_FILE=${CACHE_FILE:-/var/cache/opensm/guid2lid}
CACHE_DIR=$(dirname "${CACHE_FILE}")
tmp_cache=${CACHE_FILE}.tmp

# PING, RCP, RSH and IFCONFIG may carry options, so they are deliberately
# expanded unquoted where they are used.
PING='ping -w 1 -c 1'

RCP=${RCP:-/usr/bin/scp}
RSH=${RSH:-/usr/bin/ssh}
IFCONFIG=${IFCONFIG:-'/sbin/ifconfig -a'}

declare -i SLDD_DEBUG
RESCAN_TIME=${RESCAN_TIME:-60}

# Print the arguments on stdout when SLDD_DEBUG is 1; always succeeds.
debug()
{
    if [ "${SLDD_DEBUG}" -eq 1 ]; then
        echo "$@"
    fi
    return 0
}

if [ -z "${OSM_HOSTS}" ]; then
    debug "No OpenSM servers (OSM_HOSTS) configured for the IB subnet."
    exit 0
fi

# Split the host list on whitespace (intentional word splitting).
declare -a arr_OSM_HOSTS
# shellcheck disable=SC2206
arr_OSM_HOSTS=(${OSM_HOSTS})

num_of_osm_hosts=${#arr_OSM_HOSTS[@]}

if [ "${num_of_osm_hosts}" -eq 1 ]; then
    debug "One OpenSM server configured in the IB subnet."
    debug "Nothing to be done for SLDD"
    exit 0
fi

# SIGTERM handler: log and leave with status 0.
trap_handler()
{
    logger -i "SLDD: Exiting."
    exit 0
}

trap 'trap_handler' 15

# Succeeds when host $1 answers a single ping.
is_alive()
{
    $PING "$1" > /dev/null 2>&1
}

# Succeeds when address $1 appears as a whole word in the $IFCONFIG output.
# -F: the address is matched literally, so its dots are not regex wildcards.
is_local()
{
    $IFCONFIG | grep -qwF -- "$1" 2> /dev/null
}

# Print the size in bytes of file $1 (nothing if it cannot be read).
cache_size()
{
    du -b "$1" 2> /dev/null | awk '{print $1}' | tr -d '[:space:]'
}

# Push a snapshot of the local cache file to every other reachable host in
# OSM_HOSTS as ${CACHE_FILE}.${local_host}, and keep a local copy named
# ${CACHE_FILE}.<host> for each host an update was attempted on.
# Exits the daemon with status 5 when the remote shell step fails.
update_remote_cache()
{
    local host stat

    /bin/rm -f "${CACHE_FILE}.upd"
    /bin/cp -a "${CACHE_FILE}" "${CACHE_FILE}.upd"

    debug "Updating remote cache file"

    for host in "${arr_OSM_HOSTS[@]}"; do
        # Skip local host update
        if [ "${host}" == "${local_host}" ]; then
            continue
        fi

        if ! is_alive "${host}"; then
            debug "$host is down."
            continue
        fi

        # Prepare the remote side; the trailing "echo $?" runs remotely and
        # reports the status of the remote rm.
        stat=$($RSH "${host}" "/bin/mkdir -p ${CACHE_DIR} > /dev/null 2>&1; /bin/rm -f ${CACHE_FILE}.${local_host} > /dev/null 2>&1; echo \$?" | tr -d '[:space:]')
        if [ "X${stat}" == "X0" ]; then
            debug "Updating $host"
            logger -i "SLDD: updating $host with ${CACHE_FILE}"
            $RCP "${CACHE_FILE}.upd" "${host}:${CACHE_FILE}.${local_host}"
            # Recorded unconditionally: the main loop compares the local
            # size against these copies to decide whether to push again.
            /bin/cp "${CACHE_FILE}.upd" "${CACHE_FILE}.${host}"
        else
            debug "$RSH to $host failed."
            logger -i "SLDD: Failed to update $host with ${CACHE_FILE}. $RSH without password should be enabled"
            exit 5
        fi
    done
}

# Print the most recently modified remote cache file
# (the suffix should be like an IP address: *.*.*.*).
get_latest_remote_cache()
{
    printf '%s' "$(/bin/ls -1t "${CACHE_FILE}".*.* 2> /dev/null | head -1)"
}

# Print the largest (by size) remote cache file
# (the suffix should be like an IP address: *.*.*.*).
get_largest_remote_cache()
{
    printf '%s' "$(/bin/ls -1S "${CACHE_FILE}".*.* 2> /dev/null | head -1)"
}

# Set the globals largest_remote_cache and largest_remote_cache_size from the
# files currently on disk; the size is 0 when there is no non-empty candidate.
find_largest_remote_cache()
{
    largest_remote_cache=$(get_largest_remote_cache)
    if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
        largest_remote_cache_size=$(cache_size "${largest_remote_cache}")
    else
        largest_remote_cache_size=0
    fi
}

# Replace the local cache with ${largest_remote_cache}, keeping the previous
# content as ${CACHE_FILE}.old, then refresh the timestamp marker file.
swap_cache_files()
{
    /bin/rm -f "${CACHE_FILE}.old"
    /bin/mv "${CACHE_FILE}" "${CACHE_FILE}.old"
    /bin/cp "${largest_remote_cache}" "${CACHE_FILE}"
    touch "${tmp_cache}"
}

# Find local host in the osm hosts list (the last matching entry wins)
local_host=""
for host in "${arr_OSM_HOSTS[@]}"; do
    if is_local "${host}"; then
        local_host=${host}
    fi
done

if [ -z "${local_host}" ]; then
    debug "None of the OSM_HOSTS addresses is configured on this host."
    logger -i "SLDD: none of the OSM_HOSTS addresses is configured on this host"
fi

# Get cache file info
declare -i new_size=0
declare -i last_size=0
declare -i largest_remote_cache_size=0
largest_remote_cache=""

if [ -e "${CACHE_FILE}" ]; then
    last_size=$(cache_size "${CACHE_FILE}")
else
    # First run: make sure the directory exists before creating the files.
    /bin/mkdir -p "${CACHE_DIR}"
    touch "${CACHE_FILE}" "${tmp_cache}"
fi

# if [ ${last_size} -gt 0 ]; then
#     # First time update
#     update_remote_cache
# fi

while true; do
    if [ -s "${CACHE_FILE}" ]; then
        new_size=$(cache_size "${CACHE_FILE}")
        newest=$(/bin/ls -1t "${CACHE_FILE}" "${tmp_cache}" 2> /dev/null | head -1)

        # Check if the local cache file grew from its last version or its
        # time stamp changed (i.e. the marker file is not the newest one).
        if [ "${new_size}" -gt "${last_size}" ] ||
           [ "${newest}" != "${tmp_cache}" ]; then
            find_largest_remote_cache

            # Check if local cache file is larger than remote cache file
            if [ "${new_size}" -gt "${largest_remote_cache_size}" ]; then
                debug "Local cache file larger then remote. Update remote cache files"
                last_size=${new_size}
                update_remote_cache
                # NOTE(review): restarts the loop without sleeping.
                continue
            fi
        fi

        find_largest_remote_cache

        # Update local cache file from remote
        if [ "${largest_remote_cache_size}" -gt "${new_size}" ]; then
            debug "Local cache file shorter then remote. Use ${largest_remote_cache}"
            logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
            swap_cache_files
            last_size=${largest_remote_cache_size}
        fi

    else # The local cache file is empty
        debug "${CACHE_FILE} is empty"

        largest_remote_cache=$(get_largest_remote_cache)
        if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
            # Copy it to the current cache
            debug "Local cache file is empty. Use ${largest_remote_cache}"
            logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
            swap_cache_files
        fi

    fi

    debug "Sleeping ${RESCAN_TIME} seconds."
    sleep "${RESCAN_TIME}"

done