/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include		/* dtrace is S10 or later */

#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_bio.h"
#include "sd_ft.h"
#include "sd_misc.h"
#include "sd_pcu.h"

#include
#include
#include
#include
#ifndef DS_DDICT
#include
#endif

/*
 * kstat interface
 */

static kstat_t *sdbc_global_stats_kstat;
static int sdbc_global_stats_update(kstat_t *ksp, int rw);

typedef struct {
	kstat_named_t	ci_sdbc_count;
	kstat_named_t	ci_sdbc_loc_count;
	kstat_named_t	ci_sdbc_rdhits;
	kstat_named_t	ci_sdbc_rdmiss;
	kstat_named_t	ci_sdbc_wrhits;
	kstat_named_t	ci_sdbc_wrmiss;
	kstat_named_t	ci_sdbc_blksize;
	kstat_named_t	ci_sdbc_lru_blocks;
#ifdef DEBUG
	kstat_named_t	ci_sdbc_lru_noreq;
	kstat_named_t	ci_sdbc_lru_req;
#endif
	kstat_named_t	ci_sdbc_wlru_inq;
	kstat_named_t	ci_sdbc_cachesize;
	kstat_named_t	ci_sdbc_numblocks;
	kstat_named_t	ci_sdbc_num_shared;
	kstat_named_t	ci_sdbc_wrcancelns;
	kstat_named_t	ci_sdbc_destaged;
	kstat_named_t	ci_sdbc_nodehints;
} sdbc_global_stats_t;

static sdbc_global_stats_t sdbc_global_stats = {
	{SDBC_GKSTAT_COUNT, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_LOC_COUNT, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_RDHITS, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_RDMISS, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_WRHITS, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_WRMISS, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_BLKSIZE, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_LRU_BLOCKS, KSTAT_DATA_ULONG},
#ifdef DEBUG
	{SDBC_GKSTAT_LRU_NOREQ, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_LRU_REQ, KSTAT_DATA_ULONG},
#endif
	{SDBC_GKSTAT_WLRU_INQ, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_CACHESIZE, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_NUMBLOCKS, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_NUM_SHARED, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_WRCANCELNS, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_DESTAGED, KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_NODEHINTS, KSTAT_DATA_ULONG},
};

static kstat_t **sdbc_cd_kstats;
static kstat_t **sdbc_cd_io_kstats;
static kmutex_t *sdbc_cd_io_kstats_mutexes;
static kstat_t *sdbc_global_io_kstat;
static kmutex_t sdbc_global_io_kstat_mutex;

static int sdbc_cd_stats_update(kstat_t *ksp, int rw);
static int cd_kstat_add(int cd);
static int cd_kstat_remove(int cd);

typedef struct {
	kstat_named_t	ci_sdbc_vol_name;
	kstat_named_t	ci_sdbc_failed;
	kstat_named_t	ci_sdbc_cd;
	kstat_named_t	ci_sdbc_cache_read;
	kstat_named_t	ci_sdbc_cache_write;
	kstat_named_t	ci_sdbc_disk_read;
	kstat_named_t	ci_sdbc_disk_write;
	kstat_named_t	ci_sdbc_filesize;
	kstat_named_t	ci_sdbc_numdirty;
	kstat_named_t	ci_sdbc_numio;
	kstat_named_t	ci_sdbc_numfail;
	kstat_named_t	ci_sdbc_destaged;
	kstat_named_t	ci_sdbc_wrcancelns;
	kstat_named_t	ci_sdbc_cdhints;
} sdbc_cd_stats_t;
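/*
 * Editorial sketch (not part of the original source): the typedefs above are
 * virtual KSTAT_TYPE_NAMED layouts.  sdbc_global_stats_update() below is
 * registered as the ks_update routine of the global kstat, with the live
 * counter block passed through ks_private, so each read refreshes the named
 * entries from the private stats.  A minimal illustration of that pattern
 * follows; the function name and the KSTAT_WRITE handling are assumptions,
 * not the file's actual routine:
 *
 *	static int
 *	example_global_stats_update(kstat_t *ksp, int rw)
 *	{
 *		sdbc_global_stats_t *gst = (sdbc_global_stats_t *)ksp->ks_data;
 *		_sd_stats_t *st = (_sd_stats_t *)ksp->ks_private;
 *
 *		if (rw == KSTAT_WRITE)
 *			return (EACCES);
 *
 *		gst->ci_sdbc_count.value.ul = st->st_count;
 *		gst->ci_sdbc_blksize.value.ul = st->st_blksize;
 *		gst->ci_sdbc_cachesize.value.ul = st->st_cachesize;
 *		gst->ci_sdbc_numblocks.value.ul = st->st_numblocks;
 *		return (0);
 *	}
 */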
static sdbc_cd_stats_t sdbc_cd_stats = {
	{SDBC_CDKSTAT_VOL_NAME, KSTAT_DATA_CHAR},
	{SDBC_CDKSTAT_FAILED, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CD, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CACHE_READ, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CACHE_WRITE, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_DISK_READ, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_DISK_WRITE, KSTAT_DATA_ULONG},
#ifdef NSC_MULTI_TERABYTE
	{SDBC_CDKSTAT_FILESIZE, KSTAT_DATA_UINT64},
#else
	{SDBC_CDKSTAT_FILESIZE, KSTAT_DATA_ULONG},
#endif
	{SDBC_CDKSTAT_NUMDIRTY, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_NUMIO, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_NUMFAIL, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_DESTAGED, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_WRCANCELNS, KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CDHINTS, KSTAT_DATA_ULONG},
};

#ifdef DEBUG
/*
 * dynmem kstat interface
 */
static kstat_t *sdbc_dynmem_kstat_dm;
static int simplect_dm;
static int sdbc_dynmem_kstat_update_dm(kstat_t *ksp, int rw);

typedef struct {
	kstat_named_t	ci_sdbc_monitor_dynmem;
	kstat_named_t	ci_sdbc_max_dyn_list;
	kstat_named_t	ci_sdbc_cache_aging_ct1;
	kstat_named_t	ci_sdbc_cache_aging_ct2;
	kstat_named_t	ci_sdbc_cache_aging_ct3;
	kstat_named_t	ci_sdbc_cache_aging_sec1;
	kstat_named_t	ci_sdbc_cache_aging_sec2;
	kstat_named_t	ci_sdbc_cache_aging_sec3;
	kstat_named_t	ci_sdbc_cache_aging_pcnt1;
	kstat_named_t	ci_sdbc_cache_aging_pcnt2;
	kstat_named_t	ci_sdbc_max_holds_pcnt;
	kstat_named_t	ci_sdbc_alloc_ct;
	kstat_named_t	ci_sdbc_dealloc_ct;
	kstat_named_t	ci_sdbc_history;
	kstat_named_t	ci_sdbc_nodatas;
	kstat_named_t	ci_sdbc_candidates;
	kstat_named_t	ci_sdbc_deallocs;
	kstat_named_t	ci_sdbc_hosts;
	kstat_named_t	ci_sdbc_pests;
	kstat_named_t	ci_sdbc_metas;
	kstat_named_t	ci_sdbc_holds;
	kstat_named_t	ci_sdbc_others;
	kstat_named_t	ci_sdbc_notavail;
	kstat_named_t	ci_sdbc_process_directive;
	kstat_named_t	ci_sdbc_simplect;
} sdbc_dynmem_dm_t;

static sdbc_dynmem_dm_t sdbc_dynmem_dm = {
	{SDBC_DMKSTAT_MONITOR_DYNMEM, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_MAX_DYN_LIST, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_CT1, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_CT2, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_CT3, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_SEC1, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_SEC2, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_SEC3, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_PCNT1, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_PCNT2, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_MAX_HOLDS_PCNT, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_ALLOC_CNT, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_DEALLOC_CNT, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_HISTORY, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_NODATAS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CANDIDATES, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_DEALLOCS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_HOSTS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_PESTS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_METAS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_HOLDS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_OTHERS, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_NOTAVAIL, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_PROCESS_DIRECTIVE, KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_SIMPLECT, KSTAT_DATA_ULONG}
};
#endif

/* End of dynmem kstats */

#ifdef DEBUG
int *dmchainpull_table;		/* dmchain wastage stats */
#endif

/*
 * dynmem process vars
 */
extern _dm_process_vars_t dynmem_processing_dm;

/* metadata for volumes */
ss_voldata_t *_sdbc_gl_file_info;
size_t _sdbc_gl_file_info_size;

/* metadata for cache write blocks */
static ss_centry_info_t *_sdbc_gl_centry_info;	/* wblocks * sizeof(ss_centry_info_t) */
static size_t _sdbc_gl_centry_info_size;

static int _SD_DELAY_QUEUE = 1;
static int sdbc_allocb_inuse, sdbc_allocb_lost, sdbc_allocb_hit;
static int sdbc_allocb_pageio1, sdbc_allocb_pageio2;
static int sdbc_centry_hit, sdbc_centry_inuse, sdbc_centry_lost;
static int sdbc_dmchain_not_avail;
static int sdbc_allocb_deallocd;
static int sdbc_centry_deallocd;
static int sdbc_check_cot;
static int sdbc_ra_hash;	/* 1-block read-ahead fails due to hash hit */
static int sdbc_ra_none;	/* 1-block read-ahead fails due to "would block" */

/*
 * Set the following variable to 1 to enable pagelist io mutual
 * exclusion on all _sd_alloc_buf() operations.
 *
 * This is set to ON to prevent front end / back end races between new
 * NSC_WRTHRU io operations coming in through _sd_alloc_buf(), and
 * previously written data being flushed out to disk by the sdbc
 * flusher at the back end.
 * -- see bugtraq 4287564
 * -- Simon Crosland, Mon Nov 8 16:34:09 GMT 1999
 */
static int sdbc_pageio_always = 1;

int sdbc_use_dmchain = 0;	/* start time switch for dm chaining */
int sdbc_prefetch1 = 1;		/* do 1-block read-ahead */

/*
 * if sdbc_static_cache is 1 allocate all cache memory at startup.
 * deallocate only at shutdown.
 */
int sdbc_static_cache = 1;

#ifdef DEBUG
/*
 * Pagelist io mutual exclusion debug facility.
 */
#define	SDBC_PAGEIO_OFF		0	/* no debug */
#define	SDBC_PAGEIO_RDEV	1	/* force NSC_PAGEIO for specified dev */
#define	SDBC_PAGEIO_RAND	2	/* randomly force NSC_PAGEIO */
#define	SDBC_PAGEIO_ALL		3	/* always force NSC_PAGEIO */
static int sdbc_pageio_debug = SDBC_PAGEIO_OFF;
static dev_t sdbc_pageio_rdev = (dev_t)-1;
#endif

/*
 * INF SD cache global data
 */
_sd_cd_info_t *_sd_cache_files;
_sd_stats_t *_sd_cache_stats;
kmutex_t _sd_cache_lock;

_sd_hash_table_t *_sd_htable;
_sd_queue_t _sd_lru_q;

_sd_cctl_t *_sd_cctl[_SD_CCTL_GROUPS];
int _sd_cctl_groupsz;

_sd_net_t _sd_net_config;

extern krwlock_t sdbc_queue_lock;

unsigned int _sd_node_hint;

#define	_SD_LRU_Q	(&_sd_lru_q)

int BLK_FBAS;			/* number of FBA's in a cache block */
int CACHE_BLOCK_SIZE;		/* size in bytes of a cache block */
int CBLOCKS;
_sd_bitmap_t BLK_FBA_BITS;

static int sdbc_prefetch_valid_cnt;
static int sdbc_prefetch_busy_cnt;
static int sdbc_prefetch_trailing;
static int sdbc_prefetch_deallocd;
static int sdbc_prefetch_pageio1;
static int sdbc_prefetch_pageio2;
static int sdbc_prefetch_hit;
static int sdbc_prefetch_lost;
static int _sd_prefetch_opt = 1;	/* 0 to disable & use _prefetch_sb_vec[] */
static nsc_vec_t _prefetch_sb_vec[_SD_MAX_BLKS + 1];

_sd_bitmap_t _fba_bits[] = {
	0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff,
#if defined(_SD_8K_BLKSIZE)
	0x01ff, 0x03ff, 0x07ff, 0x0fff,
	0x1fff, 0x3fff, 0x7fff, 0xffff,
#endif
};

static int _sd_ccsync_cnt = 256;
static _sd_cctl_sync_t *_sd_ccent_sync;

nsc_io_t *sdbc_io;

#ifdef _MULTI_DATAMODEL
_sd_stats32_t *_sd_cache_stats32 = NULL;
#endif

#ifdef DEBUG
int cmn_level = CE_PANIC;
#else
int cmn_level = CE_WARN;
#endif

/*
 * Forward declare all statics that are used before defined to enforce
 * parameter checking
 * Some (if not all) of these could be removed if the code were reordered
 */
static void _sdbc_stats_deconfigure(void);
static int _sdbc_stats_configure(int cblocks);
static int _sdbc_lruq_configure(_sd_queue_t *);
static void _sdbc_lruq_deconfigure(void);
static int _sdbc_mem_configure(int cblocks, spcs_s_info_t kstatus);
static void _sdbc_mem_deconfigure(int cblocks);
static void _sd_ins_queue(_sd_queue_t *, _sd_cctl_t *centry);
static int _sd_flush_cd(int cd);
static int _sd_check_buffer_alloc(int cd, nsc_off_t fba_pos,
    nsc_size_t fba_len, _sd_buf_handle_t **hp);
static int _sd_doread(_sd_buf_handle_t *handle, _sd_cctl_t
    *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len, int flag);
static void _sd_async_read_ea(blind_t xhandle, nsc_off_t fba_pos,
    nsc_size_t fba_len, int error);
static void _sd_async_write_ea(blind_t xhandle, nsc_off_t fba_pos,
    nsc_size_t fba_len, int error);
static void _sd_queue_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
    nsc_size_t fba_len);
static int _sd_remote_store(_sd_cctl_t *cc_ent, nsc_off_t fba_pos,
    nsc_size_t fba_len);
static int _sd_copy_direct(_sd_buf_handle_t *handle1,
    _sd_buf_handle_t *handle2, nsc_off_t fba_pos1, nsc_off_t fba_pos2,
    nsc_size_t fba_len);
static int _sd_sync_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
    nsc_size_t fba_len, int flag);
static int _sd_sync_write2(_sd_buf_handle_t *wr_handle, nsc_off_t wr_st_pos,
    nsc_size_t fba_len, int flag, _sd_buf_handle_t *rd_handle,
    nsc_off_t rd_st_pos);
static int sdbc_fd_attach_cd(blind_t xcd);
static int sdbc_fd_detach_cd(blind_t xcd);
static int sdbc_fd_flush_cd(blind_t xcd);
static int _sdbc_gl_centry_configure(spcs_s_info_t);
static int _sdbc_gl_file_configure(spcs_s_info_t);
static void _sdbc_gl_centry_deconfigure(void);
static void _sdbc_gl_file_deconfigure(void);
static int sdbc_doread_prefetch(_sd_cctl_t *cc_ent, nsc_off_t fba_pos,
    nsc_size_t fba_len);
static _sd_bitmap_t update_dirty(_sd_cctl_t *cc_ent, sdbc_cblk_fba_t st_off,
    sdbc_cblk_fba_t st_len);
static int _sd_prefetch_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
    int flag, _sd_buf_handle_t *handle, int locked);

/* dynmem support */
static int _sd_setup_category_on_type(_sd_cctl_t *header);
static int _sd_setup_mem_chaining(_sd_cctl_t *header, int flag);
static int sdbc_check_cctl_cot(_sd_cctl_t *);

static int sdbc_dmqueues_configure();
static void sdbc_dmqueues_deconfigure();
static _sd_cctl_t *sdbc_get_dmchain(int, int *, int);
static int sdbc_dmchain_avail(_sd_cctl_t *);
void sdbc_requeue_dmchain(_sd_queue_t *, _sd_cctl_t *, int, int);
static void sdbc_ins_dmqueue_back(_sd_queue_t *, _sd_cctl_t *);
void sdbc_ins_dmqueue_front(_sd_queue_t *, _sd_cctl_t *);
void sdbc_remq_dmchain(_sd_queue_t *, _sd_cctl_t *);
static void sdbc_clear_dmchain(_sd_cctl_t *, _sd_cctl_t *);
void sdbc_requeue_head_dm_try(_sd_cctl_t *);
static _sd_cctl_t *sdbc_alloc_dmc(int, nsc_off_t, nsc_size_t, int *,
    sdbc_allocbuf_t *, int);
static _sd_cctl_t *sdbc_alloc_lru(int, nsc_off_t, int *, int);
static _sd_cctl_t *sdbc_alloc_from_dmchain(int, nsc_off_t,
    sdbc_allocbuf_t *, int);
static void sdbc_centry_init_dm(_sd_cctl_t *);
static int sdbc_centry_memalloc_dm(_sd_cctl_t *, int, int);
static void sdbc_centry_alloc_end(sdbc_allocbuf_t *);

/* _SD_DEBUG */
#if defined(_SD_DEBUG) || defined(DEBUG)
static int _sd_cctl_valid(_sd_cctl_t *);
#endif

static nsc_def_t _sdbc_fd_def[] = {
	"Attach", (uintptr_t)sdbc_fd_attach_cd, 0,
	"Detach", (uintptr_t)sdbc_fd_detach_cd, 0,
	"Flush", (uintptr_t)sdbc_fd_flush_cd, 0,
	0, 0, 0
};

/*
 * _sdbc_cache_configure - initialize cache blocks, queues etc.
 *
 * ARGUMENTS:
 *	cblocks - Number of cache blocks
 *
 * RETURNS:
 *	0 on success.
 *	SDBC_EENABLEFAIL or SDBC_EMEMCONFIG on failure.
 *
 */
int
_sdbc_cache_configure(int cblocks, spcs_s_info_t kstatus)
{
	CBLOCKS = cblocks;

	_sd_cache_files = (_sd_cd_info_t *)
	    kmem_zalloc(sdbc_max_devs * sizeof (_sd_cd_info_t), KM_SLEEP);

	if (_sdbc_stats_configure(cblocks))
		return (SDBC_EENABLEFAIL);

	if (sdbc_use_dmchain) {
		if (sdbc_dmqueues_configure())
			return (SDBC_EENABLEFAIL);
	} else {
		if (_sdbc_lruq_configure(_SD_LRU_Q))
			return (SDBC_EENABLEFAIL);
	}

	if (_sdbc_mem_configure(cblocks, kstatus))
		return (SDBC_EMEMCONFIG);

	CACHE_BLOCK_SIZE = BLK_SIZE(1);
	BLK_FBAS = FBA_NUM(CACHE_BLOCK_SIZE);
	BLK_FBA_BITS = _fba_bits[BLK_FBAS];

	sdbc_allocb_pageio1 = 0;
	sdbc_allocb_pageio2 = 0;
	sdbc_allocb_hit = 0;
	sdbc_allocb_inuse = 0;
	sdbc_allocb_lost = 0;
	sdbc_centry_inuse = 0;
	sdbc_centry_lost = 0;
	sdbc_centry_hit = 0;
	sdbc_centry_deallocd = 0;
	sdbc_dmchain_not_avail = 0;
	sdbc_allocb_deallocd = 0;

	sdbc_prefetch_valid_cnt = 0;
	sdbc_prefetch_busy_cnt = 0;
	sdbc_prefetch_trailing = 0;
	sdbc_prefetch_deallocd = 0;
	sdbc_prefetch_pageio1 = 0;
	sdbc_prefetch_pageio2 = 0;
	sdbc_prefetch_hit = 0;
	sdbc_prefetch_lost = 0;

	sdbc_check_cot = 0;
	sdbc_prefetch1 = 1;
	sdbc_ra_hash = 0;
	sdbc_ra_none = 0;

	return (0);
}

/*
 * _sdbc_cache_deconfigure - cache is being deconfigured. Release any
 * memory that we acquired during the configuration process and return
 * to the unconfigured state.
 *
 * NOTE: all users of the cache should be inactive at this point,
 * i.e. we are unregistered from sd and all cache daemons/threads are
 * gone.
 *
 */
void
_sdbc_cache_deconfigure(void)
{
	/* CCIO shutdown must happen before memory is free'd */

	if (_sd_cache_files) {
		kmem_free(_sd_cache_files,
		    sdbc_max_devs * sizeof (_sd_cd_info_t));
		_sd_cache_files = (_sd_cd_info_t *)NULL;
	}

	BLK_FBA_BITS = 0;
	BLK_FBAS = 0;
	CACHE_BLOCK_SIZE = 0;
	_sdbc_mem_deconfigure(CBLOCKS);
	_sdbc_gl_centry_deconfigure();
	_sdbc_gl_file_deconfigure();

	if (sdbc_use_dmchain)
		sdbc_dmqueues_deconfigure();
	else
		_sdbc_lruq_deconfigure();
	_sdbc_stats_deconfigure();

	CBLOCKS = 0;
}

/*
 * _sdbc_stats_deconfigure - cache is being deconfigured; turn off
 * stats. This could seemingly do more but we leave most of the
 * data intact until cache is configured again.
 *
 */
static void
_sdbc_stats_deconfigure(void)
{
	int i;

#ifdef DEBUG
	if (sdbc_dynmem_kstat_dm) {
		kstat_delete(sdbc_dynmem_kstat_dm);
		sdbc_dynmem_kstat_dm = NULL;
	}
#endif

	if (sdbc_global_stats_kstat) {
		kstat_delete(sdbc_global_stats_kstat);
		sdbc_global_stats_kstat = NULL;
	}

	if (sdbc_cd_kstats) {
		for (i = 0; i < sdbc_max_devs; i++) {
			if (sdbc_cd_kstats[i]) {
				kstat_delete(sdbc_cd_kstats[i]);
				sdbc_cd_kstats[i] = NULL;
			}
		}
		kmem_free(sdbc_cd_kstats, sizeof (kstat_t *) * sdbc_max_devs);
		sdbc_cd_kstats = NULL;
	}

	if (sdbc_global_io_kstat) {
		kstat_delete(sdbc_global_io_kstat);
		mutex_destroy(&sdbc_global_io_kstat_mutex);
		sdbc_global_io_kstat = NULL;
	}

	if (sdbc_cd_io_kstats) {
		for (i = 0; i < sdbc_max_devs; i++) {
			if (sdbc_cd_io_kstats[i]) {
				kstat_delete(sdbc_cd_io_kstats[i]);
				sdbc_cd_io_kstats[i] = NULL;
			}
		}
		kmem_free(sdbc_cd_io_kstats,
		    sizeof (kstat_t *) * sdbc_max_devs);
		sdbc_cd_io_kstats = NULL;
	}

	if (sdbc_cd_io_kstats_mutexes) {
		/* mutexes are already destroyed in cd_kstat_remove() */
		kmem_free(sdbc_cd_io_kstats_mutexes,
		    sizeof (kmutex_t) * sdbc_max_devs);
		sdbc_cd_io_kstats_mutexes = NULL;
	}

	if (_sd_cache_stats) {
		kmem_free(_sd_cache_stats, sizeof (_sd_stats_t) +
		    (sdbc_max_devs - 1) * sizeof (_sd_shared_t));
		_sd_cache_stats = NULL;
	}
#ifdef _MULTI_DATAMODEL
	if (_sd_cache_stats32) {
		kmem_free(_sd_cache_stats32, sizeof (_sd_stats32_t) +
		    (sdbc_max_devs - 1) * sizeof (_sd_shared_t));
		_sd_cache_stats32 = NULL;
	}
#endif
}

static int
_sdbc_stats_configure(int cblocks)
{
	_sd_cache_stats = kmem_zalloc(sizeof (_sd_stats_t) +
	    (sdbc_max_devs - 1) * sizeof (_sd_shared_t), KM_SLEEP);
	_sd_cache_stats->st_blksize = (int)BLK_SIZE(1);
	_sd_cache_stats->st_cachesize = cblocks * BLK_SIZE(1);
	_sd_cache_stats->st_numblocks = cblocks;
	_sd_cache_stats->st_wrcancelns = 0;
	_sd_cache_stats->st_destaged = 0;
#ifdef _MULTI_DATAMODEL
	_sd_cache_stats32 = kmem_zalloc(sizeof (_sd_stats32_t) +
	    (sdbc_max_devs - 1) * sizeof (_sd_shared_t), KM_SLEEP);
#endif

	/* kstat implementation - global stats */
	sdbc_global_stats_kstat = kstat_create(SDBC_KSTAT_MODULE, 0,
	    SDBC_KSTAT_GSTATS, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (sdbc_global_stats)/sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);

	if (sdbc_global_stats_kstat != NULL) {
		sdbc_global_stats_kstat->ks_data = &sdbc_global_stats;
		sdbc_global_stats_kstat->ks_update = sdbc_global_stats_update;
		sdbc_global_stats_kstat->ks_private = _sd_cache_stats;
		kstat_install(sdbc_global_stats_kstat);
	} else {
		cmn_err(CE_WARN, "!sdbc: gstats kstat failed");
	}

	/* global I/O kstats */
	sdbc_global_io_kstat = kstat_create(SDBC_KSTAT_MODULE, 0,
	    SDBC_IOKSTAT_GSTATS, "disk", KSTAT_TYPE_IO, 1, 0);

	if (sdbc_global_io_kstat) {
		mutex_init(&sdbc_global_io_kstat_mutex,
		    NULL, MUTEX_DRIVER, NULL);
		sdbc_global_io_kstat->ks_lock = &sdbc_global_io_kstat_mutex;
		kstat_install(sdbc_global_io_kstat);
	}

	/*
	 * kstat implementation - cd stats
	 * NOTE: one kstat instance for each open cache descriptor
	 */
	sdbc_cd_kstats = kmem_zalloc(sizeof (kstat_t *) *
	    sdbc_max_devs, KM_SLEEP);

	/*
	 * kstat implementation - i/o kstats per cache descriptor
	 * NOTE: one I/O kstat instance for each cd
	 */
	sdbc_cd_io_kstats = kmem_zalloc(sizeof (kstat_t *) *
	    sdbc_max_devs, KM_SLEEP);

	sdbc_cd_io_kstats_mutexes = kmem_zalloc(sizeof (kmutex_t) *
	    sdbc_max_devs, KM_SLEEP);

#ifdef DEBUG
	/* kstat implementation - dynamic memory stats */
	sdbc_dynmem_kstat_dm = kstat_create(SDBC_KSTAT_MODULE, 0,
	    SDBC_KSTAT_DYNMEM, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (sdbc_dynmem_dm)/sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);

	if (sdbc_dynmem_kstat_dm != NULL) {
		sdbc_dynmem_kstat_dm->ks_data = &sdbc_dynmem_dm;
		sdbc_dynmem_kstat_dm->ks_update = sdbc_dynmem_kstat_update_dm;
		sdbc_dynmem_kstat_dm->ks_private = &dynmem_processing_dm;
		kstat_install(sdbc_dynmem_kstat_dm);
	} else {
		cmn_err(CE_WARN, "!sdbc: dynmem kstat failed");
	}
#endif

	return (0);
}

/*
 * sdbc_dmqueues_configure()
 * initialize the queues of dynamic memory chains.
 */

_sd_queue_t *sdbc_dm_queues;
static int max_dm_queues;

static int
sdbc_dmqueues_configure()
{
	int i;

	/*
	 * CAUTION! this code depends on max_dyn_list not changing
	 * if it does change behavior may be incorrect, as cc_alloc_size_dm
	 * depends on max_dyn_list and indexes to dmqueues are derived from
	 * cc_alloc_size_dm.
	 * see _sd_setup_category_on_type() and _sd_dealloc_dm()
	 * TODO: prevent max_dyn_list from on-the-fly modification (easy) or
	 * allow for on-the-fly changes to number of dm queues (hard).
	 */
	max_dm_queues = dynmem_processing_dm.max_dyn_list;

	++max_dm_queues;	/* need a "0" queue for centrys with no memory */

	sdbc_dm_queues = (_sd_queue_t *)
	    kmem_zalloc(max_dm_queues * sizeof (_sd_queue_t), KM_SLEEP);

#ifdef DEBUG
	dmchainpull_table = (int *)kmem_zalloc(max_dm_queues *
	    max_dm_queues * sizeof (int), KM_SLEEP);
#endif

	for (i = 0; i < max_dm_queues; ++i) {
		(void) _sdbc_lruq_configure(&sdbc_dm_queues[i]);
		sdbc_dm_queues[i].sq_dmchain_cblocks = i;
	}

	return (0);
}

static void
sdbc_dmqueues_deconfigure()
{
	/* CAUTION! this code depends on max_dyn_list not changing */

	if (sdbc_dm_queues)
		kmem_free(sdbc_dm_queues,
		    max_dm_queues * sizeof (_sd_queue_t));
	sdbc_dm_queues = NULL;
	max_dm_queues = 0;
}

#define	GOOD_LRUSIZE(q)	((q->sq_inq >= 0) || (q->sq_inq <= CBLOCKS))

/*
 * _sdbc_lruq_configure - initialize the lru queue
 *
 * ARGUMENTS: NONE
 * RETURNS: 0
 *
 */
static int
_sdbc_lruq_configure(_sd_queue_t *_sd_lru)
{
	_sd_lru->sq_inq = 0;

	mutex_init(&_sd_lru->sq_qlock, NULL, MUTEX_DRIVER, NULL);

	_sd_lru->sq_qhead.cc_next = _sd_lru->sq_qhead.cc_prev =
	    &(_sd_lru->sq_qhead);
	return (0);
}

/*
 * _sdbc_lruq_deconfigure - deconfigure the lru queue
 *
 * ARGUMENTS: NONE
 *
 */
static void
_sdbc_lruq_deconfigure(void)
{
	_sd_queue_t *_sd_lru;

	_sd_lru = _SD_LRU_Q;
	mutex_destroy(&_sd_lru->sq_qlock);
	bzero(_sd_lru, sizeof (_sd_queue_t));
}

/*
 * _sdbc_mem_configure - initialize the cache memory.
 * Create and initialize the hash table.
 * Create cache control blocks and fill them with relevant
 * information and enqueue onto the lru queue.
 * Initialize the Write control blocks (blocks that contain
 * information as to where the data will be mirrored)
 * Initialize the Fault tolerant blocks (blocks that contain
 * information about the mirror nodes dirty writes)
 *
 * ARGUMENTS:
 *	cblocks - Number of cache blocks.
* RETURNS: 0 * */ static int _sdbc_mem_configure(int cblocks, spcs_s_info_t kstatus) { int num_blks, i, blk; _sd_cctl_t *centry; _sd_net_t *netc; _sd_cctl_t *prev_entry_dm, *first_entry_dm; if ((_sd_htable = _sdbc_hash_configure(cblocks)) == NULL) { spcs_s_add(kstatus, SDBC_ENOHASH); return (-1); } _sd_cctl_groupsz = (cblocks / _SD_CCTL_GROUPS) + ((cblocks % _SD_CCTL_GROUPS) != 0); for (i = 0; i < _SD_CCTL_GROUPS; i++) { _sd_cctl[i] = (_sd_cctl_t *) nsc_kmem_zalloc(_sd_cctl_groupsz * sizeof (_sd_cctl_t), KM_SLEEP, sdbc_cache_mem); if (_sd_cctl[i] == NULL) { spcs_s_add(kstatus, SDBC_ENOCB); return (-1); } } _sd_ccent_sync = (_sd_cctl_sync_t *) nsc_kmem_zalloc(_sd_ccsync_cnt * sizeof (_sd_cctl_sync_t), KM_SLEEP, sdbc_local_mem); if (_sd_ccent_sync == NULL) { spcs_s_add(kstatus, SDBC_ENOCCTL); return (-1); } for (i = 0; i < _sd_ccsync_cnt; i++) { mutex_init(&_sd_ccent_sync[i]._cc_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&_sd_ccent_sync[i]._cc_blkcv, NULL, CV_DRIVER, NULL); } blk = 0; netc = &_sd_net_config; num_blks = (netc->sn_cpages * (int)netc->sn_psize)/BLK_SIZE(1); prev_entry_dm = 0; first_entry_dm = 0; for (i = 0; i < num_blks; i++, blk++) { centry = _sd_cctl[(blk/_sd_cctl_groupsz)] + (blk%_sd_cctl_groupsz); centry->cc_sync = &_sd_ccent_sync[blk % _sd_ccsync_cnt]; centry->cc_next = centry->cc_prev = NULL; centry->cc_dirty_next = centry->cc_dirty_link = NULL; centry->cc_await_use = centry->cc_await_page = 0; centry->cc_inuse = centry->cc_pageio = 0; centry->cc_flag = 0; centry->cc_iocount = 0; centry->cc_valid = 0; if (!first_entry_dm) first_entry_dm = centry; if (prev_entry_dm) prev_entry_dm->cc_link_list_dm = centry; prev_entry_dm = centry; centry->cc_link_list_dm = first_entry_dm; centry->cc_data = 0; centry->cc_write = NULL; centry->cc_dirty = 0; { _sd_queue_t *q; if (sdbc_use_dmchain) { q = &sdbc_dm_queues[0]; centry->cc_cblocks = 0; } else q = _SD_LRU_Q; _sd_ins_queue(q, centry); } } if (_sdbc_gl_centry_configure(kstatus) != 0) return (-1); if (_sdbc_gl_file_configure(kstatus) != 0) return (-1); return (0); } /* * _sdbc_gl_file_configure() * allocate and initialize space for the global filename data. * */ static int _sdbc_gl_file_configure(spcs_s_info_t kstatus) { ss_voldata_t *fileinfo; ss_voldata_t tempfinfo; ss_vdir_t vdir; ss_vdirkey_t key; int err = 0; _sdbc_gl_file_info_size = safestore_config.ssc_maxfiles * sizeof (ss_voldata_t); if ((_sdbc_gl_file_info = kmem_zalloc(_sdbc_gl_file_info_size, KM_NOSLEEP)) == NULL) { spcs_s_add(kstatus, SDBC_ENOSFNV); return (-1); } /* setup the key to get a directory stream of all volumes */ key.vk_type = CDIR_ALL; fileinfo = _sdbc_gl_file_info; /* * if coming up after a crash, "refresh" the host * memory copy from safestore. */ if (_sdbc_warm_start()) { if (SSOP_GETVDIR(sdbc_safestore, &key, &vdir)) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure): " "cannot read safestore"); return (-1); } /* * cycle through the vdir getting volume data * and volume tokens */ while ((err = SSOP_GETVDIRENT(sdbc_safestore, &vdir, fileinfo)) == SS_OK) { ++fileinfo; } if (err != SS_EOF) { /* * fail to configure since * recovery is not possible. 
*/ spcs_s_add(kstatus, SDBC_ENOREFRESH); return (-1); } } else { /* normal initialization, not a warm start */ /* * if this fails, continue: cache will start * in writethru mode */ if (SSOP_GETVDIR(sdbc_safestore, &key, &vdir)) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure): " "cannot read safestore"); return (-1); } /* * cycle through the vdir getting just the volume tokens * and initializing volume entries */ while ((err = SSOP_GETVDIRENT(sdbc_safestore, &vdir, &tempfinfo)) == 0) { /* * initialize the host memory copy of the * global file region. this means setting the * _pinned and _attached fields to _SD_NO_HOST * because the default of zero conflicts with * the min nodeid of zero. */ fileinfo->sv_vol = tempfinfo.sv_vol; fileinfo->sv_pinned = _SD_NO_HOST; fileinfo->sv_attached = _SD_NO_HOST; fileinfo->sv_cd = _SD_NO_CD; /* initialize the directory entry */ if ((err = SSOP_SETVOL(sdbc_safestore, fileinfo)) == SS_ERR) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure): " "volume entry write failure %p", (void *)fileinfo->sv_vol); break; } ++fileinfo; } /* coming up clean, continue in w-t mode */ if (err != SS_EOF) cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure) " "unable to init safe store volinfo"); } return (0); } static void _sdbc_gl_centry_deconfigure(void) { if (_sdbc_gl_centry_info) kmem_free(_sdbc_gl_centry_info, _sdbc_gl_centry_info_size); _sdbc_gl_centry_info = NULL; _sdbc_gl_centry_info_size = 0; } static int _sdbc_gl_centry_configure(spcs_s_info_t kstatus) { int wblocks; ss_centry_info_t *cinfo; ss_cdirkey_t key; ss_cdir_t cdir; int err = 0; wblocks = safestore_config.ssc_wsize / BLK_SIZE(1); _sdbc_gl_centry_info_size = sizeof (ss_centry_info_t) * wblocks; if ((_sdbc_gl_centry_info = kmem_zalloc(_sdbc_gl_centry_info_size, KM_NOSLEEP)) == NULL) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure) " "alloc failed for gl_centry_info region"); _sdbc_gl_centry_deconfigure(); return (-1); } /* * synchronize the centry info area with safe store */ /* setup the key to get a directory stream of all centrys */ key.ck_type = CDIR_ALL; cinfo = _sdbc_gl_centry_info; if (_sdbc_warm_start()) { if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure): " "cannot read safestore"); return (-1); } /* * cycle through the cdir getting resource * tokens and reading centrys */ while ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, cinfo)) == 0) { ++cinfo; } if (err != SS_EOF) { /* * fail to configure since * recovery is not possible. 
*/ _sdbc_gl_centry_deconfigure(); spcs_s_add(kstatus, SDBC_EGLDMAFAIL); return (-1); } } else { if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure): " "cannot read safestore"); return (-1); } /* * cycle through the cdir getting resource * tokens and initializing centrys */ while ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, cinfo)) == 0) { cinfo->sc_cd = -1; cinfo->sc_fpos = -1; if ((err = SSOP_SETCENTRY(sdbc_safestore, cinfo)) == SS_ERR) { cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure): " "cache entry write failure %p", (void *)cinfo->sc_res); break; } ++cinfo; } /* coming up clean, continue in w-t mode */ if (err != SS_EOF) { cmn_err(CE_WARN, "!sdbc(sdbc_gl_centry_configure) " "_sdbc_gl_centry_info initialization failed"); } } return (0); } static void _sdbc_gl_file_deconfigure(void) { if (_sdbc_gl_file_info) kmem_free(_sdbc_gl_file_info, _sdbc_gl_file_info_size); _sdbc_gl_file_info = NULL; _sdbc_gl_file_info_size = 0; } /* * _sdbc_mem_deconfigure - deconfigure the cache memory. * Release any memory/locks/sv's acquired during _sdbc_mem_configure. * * ARGUMENTS: * cblocks - Number of cache blocks. * */ /* ARGSUSED */ static void _sdbc_mem_deconfigure(int cblocks) { int i; if (_sd_ccent_sync) { for (i = 0; i < _sd_ccsync_cnt; i++) { mutex_destroy(&_sd_ccent_sync[i]._cc_lock); cv_destroy(&_sd_ccent_sync[i]._cc_blkcv); } nsc_kmem_free(_sd_ccent_sync, _sd_ccsync_cnt * sizeof (_sd_cctl_sync_t)); } _sd_ccent_sync = NULL; for (i = 0; i < _SD_CCTL_GROUPS; i++) { if (_sd_cctl[i] != NULL) { nsc_kmem_free(_sd_cctl[i], _sd_cctl_groupsz * sizeof (_sd_cctl_t)); _sd_cctl[i] = NULL; } } _sd_cctl_groupsz = 0; _sdbc_hash_deconfigure(_sd_htable); _sd_htable = NULL; } #if defined(_SD_DEBUG) || defined(DEBUG) static int _sd_cctl_valid(_sd_cctl_t *addr) { _sd_cctl_t *end; int i, valid; valid = 0; for (i = 0; i < _SD_CCTL_GROUPS; i++) { end = _sd_cctl[i] + _sd_cctl_groupsz; if (addr >= _sd_cctl[i] && addr < end) { valid = 1; break; } } return (valid); } #endif /* * _sd_ins_queue - insert centry into LRU queue * (during initialization, locking not required) */ static void _sd_ins_queue(_sd_queue_t *q, _sd_cctl_t *centry) { _sd_cctl_t *q_head; ASSERT(_sd_cctl_valid(centry)); q_head = &q->sq_qhead; centry->cc_prev = q_head; centry->cc_next = q_head->cc_next; q_head->cc_next->cc_prev = centry; q_head->cc_next = centry; q->sq_inq++; ASSERT(GOOD_LRUSIZE(q)); } void _sd_requeue(_sd_cctl_t *centry) { _sd_queue_t *q = _SD_LRU_Q; /* was FAST */ mutex_enter(&q->sq_qlock); #if defined(_SD_DEBUG) if (1) { _sd_cctl_t *cp, *cn, *qp; cp = centry->cc_prev; cn = centry->cc_next; qp = (q->sq_qhead).cc_prev; if (!_sd_cctl_valid(centry) || (cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) || (cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) || !_sd_cctl_valid(qp)) cmn_err(CE_PANIC, "_sd_requeue %x prev %x next %x qp %x", centry, cp, cn, qp); } #endif centry->cc_prev->cc_next = centry->cc_next; centry->cc_next->cc_prev = centry->cc_prev; centry->cc_next = &(q->sq_qhead); centry->cc_prev = q->sq_qhead.cc_prev; q->sq_qhead.cc_prev->cc_next = centry; q->sq_qhead.cc_prev = centry; centry->cc_seq = q->sq_seq++; /* was FAST */ mutex_exit(&q->sq_qlock); (q->sq_req_stat)++; } void _sd_requeue_head(_sd_cctl_t *centry) { _sd_queue_t *q = _SD_LRU_Q; /* was FAST */ mutex_enter(&q->sq_qlock); #if defined(_SD_DEBUG) if (1) { _sd_cctl_t *cp, *cn, *qn; cp = centry->cc_prev; cn = centry->cc_next; qn = (q->sq_qhead).cc_prev; if (!_sd_cctl_valid(centry) || (cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) || 
(cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) || !_sd_cctl_valid(qn)) cmn_err(CE_PANIC, "_sd_requeue_head %x prev %x next %x qn %x", centry, cp, cn, qn); } #endif centry->cc_prev->cc_next = centry->cc_next; centry->cc_next->cc_prev = centry->cc_prev; centry->cc_prev = &(q->sq_qhead); centry->cc_next = q->sq_qhead.cc_next; q->sq_qhead.cc_next->cc_prev = centry; q->sq_qhead.cc_next = centry; centry->cc_seq = q->sq_seq++; centry->cc_flag &= ~CC_QHEAD; /* was FAST */ mutex_exit(&q->sq_qlock); } /* * _sd_open - Open a file. * * ARGUMENTS: * filename - Name of the file to be opened. * flag - Flag associated with open. * (currently used to determine a ckd device) * RETURNS: * cd - the cache descriptor. */ int _sd_open(char *filename, int flag) { int cd; if (!_sd_cache_initialized) { cmn_err(CE_WARN, "!sdbc(_sd_open) cache not initialized"); return (-EINVAL); } cd = _sd_open_cd(filename, -1, flag); SDTRACE(SDF_OPEN, (cd < 0) ? SDT_INV_CD : cd, 0, SDT_INV_BL, 0, cd); return (cd); } static int _sd_open_io(char *filename, int flag, blind_t *cdp, nsc_iodev_t *iodev) { _sd_cd_info_t *cdi; int cd; int rc = 0; if ((cd = _sd_open(filename, flag)) >= 0) { cdi = &(_sd_cache_files[cd]); cdi->cd_iodev = iodev; nsc_set_owner(cdi->cd_rawfd, cdi->cd_iodev); *cdp = (blind_t)(unsigned long)cd; } else rc = -cd; return (rc); } int _sd_open_cd(char *filename, const int cd, const int flag) { int new_cd, rc = 0, alloc_cd = -1; ss_voldata_t *cdg; int preexists = 0; _sd_cd_info_t *cdi; int failover_open, open_failed; major_t devmaj; minor_t devmin; if (_sdbc_shutdown_in_progress) return (-EIO); if (strlen(filename) > (NSC_MAXPATH-1)) return (-ENAMETOOLONG); /* * If the cd is >= 0, then this is a open for a specific cd. * This happens when the mirror node crashes, and we attempt to * reopen the files with the same cache descriptors as existed on * the other node */ retry_open: failover_open = 0; open_failed = 0; if (cd >= 0) { failover_open++; cdi = &(_sd_cache_files[cd]); mutex_enter(&_sd_cache_lock); if (cdi->cd_info == NULL) cdi->cd_info = &_sd_cache_stats->st_shared[cd]; else if (cdi->cd_info->sh_alloc && strcmp(cdi->cd_info->sh_filename, filename)) { cmn_err(CE_WARN, "!sdbc(_sd_open_cd) cd %d mismatch", cd); mutex_exit(&_sd_cache_lock); return (-EEXIST); } if (cdi->cd_info->sh_failed != 2) { if (cdi->cd_info->sh_alloc != 0) preexists = 1; else { cdi->cd_info->sh_alloc = CD_ALLOC_IN_PROGRESS; (void) strcpy(cdi->cd_info->sh_filename, filename); if (_sd_cache_stats->st_count < sdbc_max_devs) _sd_cache_stats->st_count++; } } mutex_exit(&_sd_cache_lock); alloc_cd = cd; goto known_cd; } new_cd = 0; mutex_enter(&_sd_cache_lock); for (cdi = &(_sd_cache_files[new_cd]), cdg = _sdbc_gl_file_info + new_cd; new_cd < (sdbc_max_devs); new_cd++, cdi++, cdg++) { if (strlen(cdg->sv_volname) != 0) if (strcmp(cdg->sv_volname, filename)) continue; if (cdi->cd_info == NULL) cdi->cd_info = &_sd_cache_stats->st_shared[new_cd]; if (cdi->cd_info->sh_failed != 2) { if (cdi->cd_info->sh_alloc != 0) preexists = 1; else { if (cd == -2) { mutex_exit(&_sd_cache_lock); return (-1); } cdi->cd_info->sh_alloc = CD_ALLOC_IN_PROGRESS; (void) strcpy(cdi->cd_info->sh_filename, filename); (void) strcpy(cdg->sv_volname, filename); cdg->sv_cd = new_cd; /* update safestore */ SSOP_SETVOL(sdbc_safestore, cdg); if (_sd_cache_stats->st_count < sdbc_max_devs) _sd_cache_stats->st_count++; cdi->cd_flag = 0; } } alloc_cd = new_cd; break; } mutex_exit(&_sd_cache_lock); if (alloc_cd == -1) return (-ENOSPC); known_cd: /* * If preexists: someone else is attempting to 
open this file as * well. Do only one open, but block everyone else here till the * open is completed. */ if (preexists) { while (cdi->cd_info->sh_alloc == CD_ALLOC_IN_PROGRESS) { delay(drv_usectohz(20000)); } if ((cdi->cd_info->sh_alloc != CD_ALLOCATED)) goto retry_open; return (alloc_cd); } if (!(cdi->cd_rawfd = nsc_open(filename, NSC_SDBC_ID|NSC_DEVICE, _sdbc_fd_def, (blind_t)(unsigned long)alloc_cd, &rc)) || !nsc_getval(cdi->cd_rawfd, "DevMaj", (int *)&devmaj) || !nsc_getval(cdi->cd_rawfd, "DevMin", (int *)&devmin)) { if (cdi->cd_rawfd) { (void) nsc_close(cdi->cd_rawfd); cdi->cd_rawfd = NULL; } /* * take into account that there may be pinned data on a * device that can no longer be opened */ open_failed++; if (!(cdi->cd_info->sh_failed) && !failover_open) { cdi->cd_info->sh_alloc = 0; mutex_enter(&_sd_cache_lock); _sd_cache_stats->st_count--; mutex_exit(&_sd_cache_lock); if (!rc) rc = EIO; return (-rc); } } cdi->cd_strategy = nsc_get_strategy(devmaj); cdi->cd_crdev = makedevice(devmaj, devmin); cdi->cd_desc = alloc_cd; cdi->cd_dirty_head = cdi->cd_dirty_tail = NULL; cdi->cd_io_head = cdi->cd_io_tail = NULL; cdi->cd_hint = 0; #ifdef DEBUG /* put the dev_t in the ioerr_inject_table */ _sdbc_ioj_set_dev(alloc_cd, cdi->cd_crdev); #endif cdi->cd_global = (_sdbc_gl_file_info + alloc_cd); if (open_failed) { cdi->cd_info->sh_failed = 2; } else if (cdi->cd_info->sh_failed != 2) if ((cdi->cd_global->sv_pinned == _SD_SELF_HOST) && !failover_open) cdi->cd_info->sh_failed = 1; else cdi->cd_info->sh_failed = 0; cdi->cd_flag |= flag; mutex_init(&cdi->cd_lock, NULL, MUTEX_DRIVER, NULL); #ifndef _SD_NOTRACE (void) _sdbc_tr_configure(alloc_cd); #endif cdi->cd_info->sh_alloc = CD_ALLOCATED; cdi->cd_global = (_sdbc_gl_file_info + alloc_cd); cdi->cd_info->sh_cd = (unsigned short) alloc_cd; mutex_enter(&_sd_cache_lock); _sd_cache_stats->st_loc_count++; mutex_exit(&_sd_cache_lock); if (cd_kstat_add(alloc_cd) < 0) { cmn_err(CE_WARN, "!Could not create kstats for cache descriptor" " %d", alloc_cd); } return (open_failed ? -EIO : alloc_cd); } /* * _sd_close - Close a cache descriptor. * * ARGUMENTS: * cd - the cache descriptor to be closed. * RETURNS: * 0 on success. * Error otherwise. * * Note: Under Construction. */ int _sd_close(int cd) { int rc; _sd_cd_info_t *cdi = &(_sd_cache_files[cd]); if (!FILE_OPENED(cd)) { rc = EINVAL; goto out; } SDTRACE(ST_ENTER|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, 0); mutex_enter(&_sd_cache_lock); if ((cdi->cd_info->sh_alloc == 0) || (cdi->cd_info->sh_alloc & CD_CLOSE_IN_PROGRESS)) { mutex_exit(&_sd_cache_lock); SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, EINVAL); rc = EINVAL; goto out; } cdi->cd_info->sh_alloc |= CD_CLOSE_IN_PROGRESS; mutex_exit(&_sd_cache_lock); /* * _sd_flush_cd() will return -1 for the case where pinned * data is present, but has been transfered to the mirror * node. In this case it is safe to close the device as * though _sd_flush_cd() had returned 0. 
*/ rc = _sd_flush_cd(cd); if (rc == -1) rc = 0; if (rc != 0) { mutex_enter(&_sd_cache_lock); if ((rc == EAGAIN) && (cdi->cd_global->sv_pinned == _SD_NO_HOST)) { cdi->cd_global->sv_pinned = _SD_SELF_HOST; SSOP_SETVOL(sdbc_safestore, cdi->cd_global); } cdi->cd_info->sh_alloc &= ~CD_CLOSE_IN_PROGRESS; mutex_exit(&_sd_cache_lock); SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, _SD_CD_WBLK_USED(cd), rc); goto out; } rc = nsc_close(cdi->cd_rawfd); if (rc) { mutex_enter(&_sd_cache_lock); cdi->cd_info->sh_alloc &= ~CD_CLOSE_IN_PROGRESS; mutex_exit(&_sd_cache_lock); SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, rc); goto out; } mutex_enter(&_sd_cache_lock); _sd_cache_stats->st_loc_count--; mutex_exit(&_sd_cache_lock); if (cd_kstat_remove(cd) < 0) { cmn_err(CE_WARN, "!Could not remove kstat for cache descriptor " "%d", cd); } cdi->cd_info->sh_alloc = 0; cdi->cd_info->sh_failed = 0; /* cdi->cd_info = NULL; */ cdi->cd_flag = 0; SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, NSC_DONE); rc = NSC_DONE; goto out; out: return (rc); } static int _sd_close_io(blind_t xcd) { _sd_cd_info_t *cdi; int cd = (int)(unsigned long)xcd; int rc = 0; if ((rc = _sd_close((int)cd)) == NSC_DONE) { cdi = &(_sd_cache_files[cd]); cdi->cd_iodev = NULL; } return (rc); } /* * _sdbc_remote_store_pinned - reflect pinned/failed blocks for cd * to our remote mirror. Returns count of blocks reflected or -1 on error. * */ int _sdbc_remote_store_pinned(int cd) { int cnt = 0; _sd_cd_info_t *cdi = &(_sd_cache_files[cd]); _sd_cctl_t *cc_ent, *cc_list; ASSERT(cd >= 0); if (cdi->cd_info->sh_failed) { if (cdi->cd_global->sv_pinned == _SD_NO_HOST) { cdi->cd_global->sv_pinned = _SD_SELF_HOST; SSOP_SETVOL(sdbc_safestore, cdi->cd_global); } mutex_enter(&cdi->cd_lock); cc_ent = cc_list = cdi->cd_fail_head; while (cc_ent) { cnt++; /* is this always necessary? jgk */ if (SSOP_WRITE_CBLOCK(sdbc_safestore, cc_ent->cc_write->sc_res, cc_ent->cc_data, CACHE_BLOCK_SIZE, 0)) { mutex_exit(&cdi->cd_lock); return (-1); } /* update the cache block metadata */ CENTRY_SET_FTPOS(cc_ent); cc_ent->cc_write->sc_flag = cc_ent->cc_flag; cc_ent->cc_write->sc_dirty = CENTRY_DIRTY(cc_ent); SSOP_SETCENTRY(sdbc_safestore, cc_ent->cc_write); cc_ent = cc_ent->cc_dirty_next; if (!cc_ent) cc_ent = cc_list = cc_list->cc_dirty_link; } mutex_exit(&cdi->cd_lock); } return (cnt); } /* * _sd_flush_cd() * reflect pinned blocks to mirrored node * wait for dirty blocks to be flushed * returns: * EIO I/O failure, or pinned blocks and no mirror * EAGAIN Hang: count of outstanding writes isn't decreasing * -1 pinned blocks, reflected to mirror * 0 success */ static int _sd_flush_cd(int cd) { int rc; if ((rc = _sd_wait_for_flush(cd)) == 0) return (0); /* * if we timed out simply return otherwise * it must be an i/o type of error */ if (rc == EAGAIN) return (rc); if (_sd_is_mirror_down()) return (EIO); /* already failed, no mirror */ /* flush any pinned/failed blocks to mirror */ if (_sdbc_remote_store_pinned(cd) >= 0) /* * At this point it looks like we have blocks on the * failed list and taking up space on this node but * no longer have responsibility for the blocks. * These blocks will in fact be freed from the cache * and the failed list when the mirror picks them up * from safe storage and then calls _sd_cd_discard_mirror * which will issue an rpc telling us to finish up. * * Should the other node die before sending the rpc then * we are safe with these blocks simply waiting on the * failed list. 
*/ return (-1); else return (rc); } /* * _sdbc_io_attach_cd -- set up for client access to device, reserve raw device * * ARGUMENTS: * cd - the cache descriptor to attach. * * RETURNS: * 0 on success. * Error otherwise. */ int _sdbc_io_attach_cd(blind_t xcd) { int rc = 0; _sd_cd_info_t *cdi; int cd = (int)(unsigned long)xcd; SDTRACE(ST_ENTER|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, 0); if (!_sd_cache_initialized || _sdbc_shutdown_in_progress || !FILE_OPENED(cd)) { SDTRACE(ST_EXIT|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, EINVAL); DTRACE_PROBE(_sdbc_io_attach_cd_end1); return (EINVAL); } cdi = &(_sd_cache_files[cd]); /* * check if disk is failed without raw device open. If it is, * it has to be recovered using _sd_disk_online */ if (cdi->cd_global->sv_pinned == _SD_SELF_HOST) { _sd_print(3, "_sdbc_io_attach_cd: pinned data. returning EINVAL"); DTRACE_PROBE(_sdbc_io_attach_cd_end2); return (EINVAL); } if ((cdi->cd_info == NULL) || (cdi->cd_info->sh_failed)) { DTRACE_PROBE1(_sdbc_io_attach_cd_end3, struct _sd_shared *, cdi->cd_info); return (EINVAL); } #if defined(_SD_FAULT_RES) /* wait for node recovery to finish */ if (_sd_node_recovery) (void) _sd_recovery_wait(); #endif /* this will provoke a sdbc_fd_attach_cd call .. */ rc = nsc_reserve(cdi->cd_rawfd, NSC_MULTI); SDTRACE(ST_EXIT|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, rc); return (rc); } /* * sdbc_fd_attach_cd -- setup cache for access to raw device underlying cd. * This is provoked by some piece of sdbc doing a reserve on the raw device. * * ARGUMENTS: * cd - the cache descriptor to attach. * * RETURNS: * 0 on success. * Error otherwise. */ static int sdbc_fd_attach_cd(blind_t xcd) { int rc = 0; int cd = (int)(unsigned long)xcd; _sd_cd_info_t *cdi; if (!_sd_cache_initialized || !FILE_OPENED(cd)) { SDTRACE(ST_INFO|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, EINVAL); DTRACE_PROBE(sdbc_fd_attach_cd_end1); return (EINVAL); } cdi = &(_sd_cache_files[cd]); #if defined(_SD_FAULT_RES) /* retrieve pinned/failed data */ if (!_sd_node_recovery) { (void) _sd_repin_cd(cd); } #endif rc = nsc_partsize(cdi->cd_rawfd, &cdi->cd_info->sh_filesize); if (rc != 0) { SDTRACE(ST_INFO|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, rc); DTRACE_PROBE(sdbc_fd_attach_cd_end3); return (rc); } cdi->cd_global->sv_attached = _SD_SELF_HOST; SSOP_SETVOL(sdbc_safestore, cdi->cd_global); mutex_enter(&_sd_cache_lock); cdi->cd_info->sh_flag |= CD_ATTACHED; mutex_exit(&_sd_cache_lock); return (0); } /* * _sdbc_io_detach_cd -- release raw device * Called when a cache client is being detached from this cd. * * ARGUMENTS: * cd - the cache descriptor to detach. * RETURNS: * 0 on success. * Error otherwise. 
*/ int _sdbc_io_detach_cd(blind_t xcd) { int cd = (int)(unsigned long)xcd; _sd_cd_info_t *cdi; SDTRACE(ST_ENTER|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0); if (!_sd_cache_initialized || !FILE_OPENED(cd)) { SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EINVAL); DTRACE_PROBE(_sdbc_io_detach_cd_end1); return (EINVAL); } #if defined(_SD_FAULT_RES) if (_sd_node_recovery) (void) _sd_recovery_wait(); #endif /* relinquish responsibility for device */ cdi = &(_sd_cache_files[cd]); if (!(cdi->cd_rawfd) || !nsc_held(cdi->cd_rawfd)) { cmn_err(CE_WARN, "!sdbc(_sdbc_detach_cd)(%d) not attached", cd); SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EPROTO); DTRACE_PROBE1(_sdbc_io_detach_cd_end2, nsc_fd_t *, cdi->cd_rawfd); return (EPROTO); } /* this will provoke/allow a call to sdbc_fd_detach_cd */ nsc_release(cdi->cd_rawfd); SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0); return (0); } /* * _sdbc_detach_cd -- flush dirty writes to disk, release raw device * Called when raw device is being detached from this cd. * * ARGUMENTS: * cd - the cache descriptor to detach. * rd_only - non-zero if detach is for read access. * RETURNS: * 0 on success. * Error otherwise. */ static int sdbc_detach_cd(blind_t xcd, int rd_only) { int rc; int cd = (int)(unsigned long)xcd; _sd_cd_info_t *cdi; SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0); if (!_sd_cache_initialized || !FILE_OPENED(cd)) { SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EINVAL); DTRACE_PROBE(sdbc_detach_cd_end1); return (EINVAL); } rc = _sd_flush_cd(cd); if (rc > 0) { SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, rc); DTRACE_PROBE(sdbc_detach_cd_end2); return (rc); } if (!rd_only) { _sd_hash_invalidate_cd(cd); cdi = &(_sd_cache_files[cd]); if (cdi->cd_global->sv_attached == _SD_SELF_HOST) { cdi->cd_global->sv_attached = _SD_NO_HOST; SSOP_SETVOL(sdbc_safestore, cdi->cd_global); } else { cmn_err(CE_WARN, "!sdbc(_sdbc_detach_cd) (%d) attached by node %d", cd, cdi->cd_global->sv_attached); SDTRACE(SDF_DETACH, cd, 0, SDT_INV_BL, 0, EPROTO); DTRACE_PROBE1(sdbc_detach_cd_end3, int, cdi->cd_global->sv_attached); return (EPROTO); } mutex_enter(&_sd_cache_lock); cdi->cd_info->sh_flag &= ~CD_ATTACHED; mutex_exit(&_sd_cache_lock); } SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0); return (0); } /* * _sdbc_fd_detach_cd -- flush dirty writes to disk, release raw device * Called when raw device is being detached from this cd. * * ARGUMENTS: * xcd - the cache descriptor to detach. * RETURNS: * 0 on success. * Error otherwise. */ static int sdbc_fd_detach_cd(blind_t xcd) { return (sdbc_detach_cd(xcd, 0)); } /* * sdbc_fd_flush_cd - raw device "xcd" is being detached and needs * flushing. We only need to flush we don't need to hash invalidate * this file. */ static int sdbc_fd_flush_cd(blind_t xcd) { return (sdbc_detach_cd(xcd, 1)); } /* * _sd_get_pinned - re-issue PINNED callbacks for cache device * * ARGUMENTS: * cd - the cache descriptor to reissue pinned calbacks from. * RETURNS: * 0 on success. * Error otherwise. 
*/ int _sd_get_pinned(blind_t xcd) { _sd_cd_info_t *cdi; _sd_cctl_t *cc_list, *cc_ent; int cd = (int)(unsigned long)xcd; cdi = &_sd_cache_files[cd]; if (cd < 0 || cd >= sdbc_max_devs) { DTRACE_PROBE(_sd_get_pinned_end1); return (EINVAL); } if (!FILE_OPENED(cd)) { DTRACE_PROBE(_sd_get_pinned_end2); return (0); } mutex_enter(&cdi->cd_lock); if (!cdi->cd_info->sh_failed) { mutex_exit(&cdi->cd_lock); DTRACE_PROBE(_sd_get_pinned_end3); return (0); } cc_ent = cc_list = cdi->cd_fail_head; while (cc_ent) { if (CENTRY_PINNED(cc_ent)) nsc_pinned_data(cdi->cd_iodev, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), BLK_FBAS); cc_ent = cc_ent->cc_dirty_next; if (!cc_ent) cc_ent = cc_list = cc_list->cc_dirty_link; } mutex_exit(&cdi->cd_lock); return (0); } /* * _sd_allocate_buf - allocate a vector of buffers for io. * *This call has been replaced by _sd_alloc_buf* */ _sd_buf_handle_t * _sd_allocate_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag, int *sts) { _sd_buf_handle_t *handle = NULL; *sts = _sd_alloc_buf((blind_t)(unsigned long)cd, fba_pos, fba_len, flag, &handle); if (*sts == NSC_HIT) *sts = NSC_DONE; return (handle); } /* * _sd_prefetch_buf - _sd_alloc_buf w/flag = NSC_RDAHEAD|NSC_RDBUF * no 'bufvec' (data is not read by caller) * skip leading valid or busy entries (data available sooner) * truncate on busy block (to avoid deadlock) * release trailing valid entries, adjust length before starting I/O. */ static int _sd_prefetch_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag, _sd_buf_handle_t *handle, int locked) { _sd_cd_info_t *cdi; nsc_off_t cblk; /* position of temp cache block */ sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ nsc_off_t io_pos; /* offset in FBA's */ nsc_size_t fba_orig_len; int sts, stall; _sd_cctl_t *centry = NULL; _sd_cctl_t *lentry = NULL; _sd_cctl_t *ioent = NULL; _sd_cctl_t *last_ioent = NULL; sdbc_allocbuf_t alloc_tok = {0}; int this_entry_type = 0; nsc_size_t request_blocks = 0; /* number of cache blocks required */ int pageio; handle->bh_flag |= NSC_HACTIVE; ASSERT(cd >= 0); cdi = &_sd_cache_files[cd]; /* prefetch: truncate if req'd */ if (fba_len > sdbc_max_fbas) fba_len = sdbc_max_fbas; if ((fba_pos + fba_len) > cdi->cd_info->sh_filesize) { if (fba_pos >= cdi->cd_info->sh_filesize) { sts = EIO; goto done; } fba_len = cdi->cd_info->sh_filesize - fba_pos; } fba_orig_len = fba_len; _SD_SETUP_HANDLE(handle, cd, fba_pos, fba_len, flag); handle->bh_centry = NULL; cblk = FBA_TO_BLK_NUM(fba_pos); st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; /* * count number of blocks on chain that is required */ if ((nsc_size_t)st_cblk_len >= fba_len) { st_cblk_len = (sdbc_cblk_fba_t)fba_len; end_cblk_len = 0; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } request_blocks = 1; /* at least one */ /* middle piece */ request_blocks += (fba_len - (st_cblk_len + end_cblk_len)) >> BLK_FBA_SHFT; if (end_cblk_len) ++request_blocks; stall = 0; do { pageio = ((flag & NSC_PAGEIO) != 0 || sdbc_pageio_always != 0); cget: if (centry = (_sd_cctl_t *) _sd_hash_search(cd, cblk, _sd_htable)) { try: /* prefetch: skip leading valid blocks */ if ((ioent == NULL) && SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry)) { skip: sdbc_prefetch_valid_cnt++; --request_blocks; lentry = centry; centry = NULL; cblk++; fba_len -= st_cblk_len; st_cblk_off = 0; st_cblk_len = (sdbc_cblk_fba_t) ((fba_len > 
(nsc_size_t)BLK_FBAS) ? BLK_FBAS : fba_len); continue; } if (SET_CENTRY_INUSE(centry)) { /* * prefetch: skip leading busy * or truncate at busy block */ if (ioent == NULL) goto skip; sdbc_prefetch_busy_cnt++; fba_orig_len -= fba_len; fba_len = 0; centry = lentry; /* backup */ break; } /* * bug 4529671 * now that we own the centry make sure that * it is still good. it could have been processed * by _sd_dealloc_dm() in the window between * _sd_hash_search() and SET_CENTRY_INUSE(). */ if ((_sd_cctl_t *) _sd_hash_search(cd, cblk, _sd_htable) != centry) { sdbc_prefetch_deallocd++; #ifdef DEBUG cmn_err(CE_WARN, "!prefetch centry %p cd %d cblk %" NSC_SZFMT " fba_len %" NSC_SZFMT " lost to dealloc?! " "cc_data %p", (void *)centry, cd, cblk, fba_orig_len, (void *)centry->cc_data); #endif CLEAR_CENTRY_INUSE(centry); continue; } if (CC_CD_BLK_MATCH(cd, cblk, centry)) { /* * Do pagelist io mutual exclusion * before messing with the centry. */ if (pageio && SET_CENTRY_PAGEIO(centry)) { /* flusher not done with pageio */ /* * prefetch: skip leading busy * or truncate at busy block */ CLEAR_CENTRY_INUSE(centry); if (ioent == NULL) goto skip; sdbc_prefetch_pageio1++; fba_orig_len -= fba_len; fba_len = 0; centry = lentry; /* backup */ break; } sdbc_prefetch_hit++; this_entry_type = HASH_ENTRY_DM; pageio = 0; centry->cc_toflush = 0; centry->cc_hits++; /* this will reset the age flag */ sdbc_centry_init_dm(centry); DTRACE_PROBE1(_sd_prefetch_buf, _sd_cctl_t *, centry); } else { /* block mismatch */ sdbc_prefetch_lost++; CLEAR_CENTRY_INUSE(centry); continue; } } else { centry = sdbc_centry_alloc(cd, cblk, request_blocks, &stall, &alloc_tok, ALLOC_NOWAIT); if (centry == NULL) { /* * prefetch: cache is very busy. just do * the i/o for the blocks already acquired, * if any. */ fba_orig_len -= fba_len; fba_len = 0; /* * if we have a chain of centry's * then back up (set centry to lentry). * if there is no chain (ioent == NULL) * then centry remains NULL. this can occur * if all previous centrys were hash hits * on valid blocks that were processed in * the skip logic above. */ if (ioent) centry = lentry; /* backup */ break; } /* * dmchaining adjustment. * if centry was obtained from the dmchain * then clear local pageio variable because the * centry already has cc_pageio set. */ if (CENTRY_PAGEIO(centry)) pageio = 0; DTRACE_PROBE1(_sd_alloc_buf, _sd_cctl_t *, centry); this_entry_type = ELIGIBLE_ENTRY_DM; if (centry->cc_aging_dm & FOUND_IN_HASH_DM) this_entry_type = HASH_ENTRY_DM; else { if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM) this_entry_type = HOLD_ENTRY_DM; } } centry->cc_chain = NULL; centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM); /* * Do pagelist io mutual exclusion now if we did not do * it above. 
*/ if (pageio && SET_CENTRY_PAGEIO(centry)) { /* flusher not done with pageio */ sdbc_prefetch_pageio2++; /* * prefetch: skip leading busy * or truncate at busy block */ CLEAR_CENTRY_INUSE(centry); if (ioent == NULL) goto skip; sdbc_prefetch_busy_cnt++; fba_orig_len -= fba_len; fba_len = 0; centry = lentry; /* backup */ break; } pageio = 0; fba_len -= st_cblk_len; if (ioent == NULL) { if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry)) { io_pos = BLK_TO_FBA_NUM(cblk) + st_cblk_off; ioent = last_ioent = centry; } else { DATA_LOG(SDF_ALLOC, centry, st_cblk_off, st_cblk_len); DTRACE_PROBE4(_sd_prefetch_buf_data1, uint64_t, (uint64_t)(BLK_TO_FBA_NUM(cblk) + st_cblk_off), int, st_cblk_len, char *, *(int64_t *)(centry->cc_data + FBA_SIZE(st_cblk_off)), char *, *(int64_t *)(centry->cc_data + FBA_SIZE(st_cblk_off + st_cblk_len) - 8)); } handle->bh_centry = centry; st_cblk_off = 0; st_cblk_len = (sdbc_cblk_fba_t) ((fba_len > (nsc_size_t)BLK_FBAS) ? BLK_FBAS : fba_len); } else { if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry)) last_ioent = centry; else { DTRACE_PROBE4(_sd_prefetch_buf_data2, uint64_t, (uint64_t)(BLK_TO_FBA_NUM(cblk) + st_cblk_off), int, st_cblk_len, char *, *(int64_t *)(centry->cc_data + FBA_SIZE(st_cblk_off)), char *, *(int64_t *)(centry->cc_data + FBA_SIZE(st_cblk_off + st_cblk_len) - 8)); } lentry->cc_chain = centry; if (fba_len < (nsc_size_t)BLK_FBAS) st_cblk_len = (sdbc_cblk_fba_t)fba_len; } lentry = centry; cblk++; /* if this block has a new identity clear prefetch history */ if (this_entry_type != HASH_ENTRY_DM) centry->cc_aging_dm &= ~(PREFETCH_BUF_I | PREFETCH_BUF_E); centry->cc_aging_dm &= ~(ENTRY_FIELD_DM); centry->cc_aging_dm |= this_entry_type | PREFETCH_BUF_E; if (flag & NSC_METADATA) centry->cc_aging_dm |= STICKY_METADATA_DM; --request_blocks; } while (fba_len > 0); if (locked) { rw_exit(&sdbc_queue_lock); locked = 0; } sdbc_centry_alloc_end(&alloc_tok); if (centry) { centry->cc_chain = NULL; if (sts = _sd_setup_category_on_type(handle->bh_centry)) { (void) _sd_free_buf(handle); goto done; } (void) _sd_setup_mem_chaining(handle->bh_centry, 0); } if (ioent) { /* prefetch: trailing valid can be released, adjust len */ if ((centry != last_ioent)) { centry = last_ioent->cc_chain; last_ioent->cc_chain = NULL; while (centry) { lentry = centry->cc_chain; centry->cc_aging_dm &= ~PREFETCH_BUF_E; _sd_centry_release(centry); centry = lentry; sdbc_prefetch_trailing++; } fba_len = (CENTRY_BLK(last_ioent) - CENTRY_BLK(ioent) + 1) * BLK_FBAS - BLK_FBA_OFF(io_pos); fba_orig_len = fba_len + (io_pos - fba_pos); } _SD_DISCONNECT_CALLBACK(handle); sts = _sd_doread(handle, ioent, io_pos, (fba_pos + fba_orig_len - io_pos), flag); if (sts > 0) (void) _sd_free_buf(handle); } else { CACHE_FBA_READ(cd, fba_orig_len); CACHE_READ_HIT; FBA_READ_IO_KSTATS(cd, FBA_SIZE(fba_orig_len)); sts = NSC_HIT; } done: if (locked) rw_exit(&sdbc_queue_lock); return (sts); } /* * _sd_cc_wait - wait for inuse cache block to become available * Usage: * if (SET_CENTRY_INUSE(centry)) { * _sd_cc_wait(cd, blk, centry, CC_INUSE); * goto try_again; * } * -or- * if (SET_CENTRY_PAGEIO(centry)) { * _sd_cc_wait(cd, blk, centry, CC_PAGEIO); * goto try_again; * } */ void _sd_cc_wait(int cd, nsc_off_t cblk, _sd_cctl_t *centry, int flag) { volatile ushort_t *waiters; volatile uchar_t *uflag; if (flag == CC_INUSE) { waiters = &(centry->cc_await_use); uflag = &(CENTRY_INUSE(centry)); } else if (flag == CC_PAGEIO) { waiters = &(centry->cc_await_page); uflag = &(CENTRY_PAGEIO(centry)); } else { /* Oops! 
*/ #ifdef DEBUG cmn_err(CE_WARN, "!_sd_cc_wait: unknown flag value (%x)", flag); #endif return; } mutex_enter(¢ry->cc_lock); if (CC_CD_BLK_MATCH(cd, cblk, centry) && (*uflag) != 0) { (*waiters)++; sd_serialize(); if ((*uflag) != 0) { unsigned stime = nsc_usec(); cv_wait(¢ry->cc_blkcv, ¢ry->cc_lock); (*waiters)--; mutex_exit(¢ry->cc_lock); SDTRACE(ST_INFO|SDF_ENT_GET, cd, 0, BLK_TO_FBA_NUM(cblk), (nsc_usec()-stime), 0); } else { (*waiters)--; mutex_exit(¢ry->cc_lock); } } else mutex_exit(¢ry->cc_lock); } /* * _sd_alloc_buf - Allocate a vector of buffers for io. * * ARGUMENTS: * cd - Cache descriptor (from a previous open) * fba_pos - disk position (512-byte FBAs) * fba_len - length in disk FBAs. * flag - allocation type. Flag is one or more of * NSC_RDBUF, NSC_WRBUF, NSC_NOBLOCK and hints. * NSC_RDAHEAD - prefetch for future read. * handle_p - pointer to a handle pointer. * If the handle pointer is non-null, its used as a * pre-allocated handle. Else a new handle will be allocated * and stored in *handle_p * * RETURNS: * errno if return > 0. * else NSC_HIT or NSC_DONE on success * or NSC_PENDING on io in progress and NSC_NOBLOCK * specified in the flag. * USAGE: * This routine allocates the cache blocks requested and creates a list * of entries for this request. * If NSC_NOBLOCK was not specified, this call could block on read io. * If flag specified NSC_RDBUF and the request is not an entire * hit, an io is initiated. */ int _sd_alloc_buf(blind_t xcd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag, _sd_buf_handle_t **handle_p) { int cd = (int)(unsigned long)xcd; _sd_cd_info_t *cdi; _sd_buf_handle_t *handle; int sts; nsc_off_t st_cblk, cblk; /* position of start and temp cache block */ sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ nsc_off_t io_pos; /* offset in FBA's */ _sd_bufvec_t *bufvec; _sd_cctl_t *centry, *lentry, *ioent = NULL; nsc_size_t fba_orig_len = fba_len; /* FBA length of orig request */ int stall, pageio; unsigned char cc_flag; int this_entry_type; int locked = 0; nsc_size_t dmchain_request_blocks; /* size of dmchain in cache blocks */ sdbc_allocbuf_t alloc_tok = {0}; int min_frag = 0; /* frag statistics */ int max_frag = 0; /* frag statistics */ int nfrags = 0; /* frag statistics */ #ifdef DEBUG int err = 0; #endif ASSERT(*handle_p != NULL); handle = *handle_p; if (_sdbc_shutdown_in_progress) return (EIO); if (xcd == NSC_ANON_CD) cd = _CD_NOHASH; KSTAT_RUNQ_ENTER(cd); /* * Force large writes on nvram systems to be write-through to * avoid the (slow) bcopy into nvram. */ if (flag & NSC_WRBUF) { if (fba_len > (nsc_size_t)sdbc_wrthru_len) { flag |= NSC_WRTHRU; } } #ifdef DEBUG if (sdbc_pageio_debug != SDBC_PAGEIO_OFF) { switch (sdbc_pageio_debug) { case SDBC_PAGEIO_RDEV: if (cd != _CD_NOHASH && sdbc_pageio_rdev != (dev_t)-1 && _sd_cache_files[cd].cd_crdev == sdbc_pageio_rdev) flag |= NSC_PAGEIO; break; case SDBC_PAGEIO_RAND: if ((nsc_lbolt() % 3) == 0) flag |= NSC_PAGEIO; break; case SDBC_PAGEIO_ALL: flag |= NSC_PAGEIO; break; } } #endif /* DEBUG */ if (fba_len > (nsc_size_t)BLK_FBAS) { rw_enter(&sdbc_queue_lock, RW_WRITER); locked = 1; } /* * _CD_NOHASH: client wants temporary (not hashed) cache memory * not associated with a local disk. Skip local disk checks. 
*/ if (cd == _CD_NOHASH) { flag &= ~(NSC_RDBUF | NSC_WRBUF | NSC_RDAHEAD); handle = *handle_p; handle->bh_flag |= NSC_HACTIVE; goto setup; } SDTRACE(ST_ENTER|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, 0); if ((flag & NSC_RDAHEAD) && _sd_prefetch_opt) { sts = _sd_prefetch_buf(cd, fba_pos, fba_len, flag, handle, locked); goto done; } #if !defined(_SD_NOCHECKS) if (flag & NSC_RDAHEAD) { /* _sd_prefetch_opt == 0 */ nsc_size_t file_size; /* file_size in FBA's */ /* prefetch: truncate if req'd */ if (fba_len > sdbc_max_fbas) fba_len = sdbc_max_fbas; file_size = _sd_cache_files[(cd)].cd_info->sh_filesize; if ((fba_pos + fba_len) > file_size) { fba_len = file_size - fba_pos; #ifdef NSC_MULTI_TERABYTE if ((int64_t)fba_len <= 0) { #else if ((int32_t)fba_len <= 0) { #endif sts = EIO; SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, sts); goto done; } } } else if (sts = _sd_check_buffer_alloc(cd, fba_pos, fba_len, handle_p)) { SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, sts); goto done; } #endif if (fba_len == 0) { SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, EINVAL); sts = EINVAL; goto done; } handle->bh_flag |= NSC_HACTIVE; cdi = &_sd_cache_files[cd]; if (cdi->cd_recovering) { /* * If recovering this device, then block all allocates * for reading or writing. If we allow reads then * this path could see old data before we recover. * If we allow writes then new data could be overwritten * by old data. * This is clearly still not a complete solution as * the thread doing this allocate could conceivably be * by this point (and in _sd_write/_sd_read for that matter * which don't even have this protection). But this type * of path seems to only exist in a failover situation * where a device has failed on the other node and works * on this node so the problem is not a huge one but exists * never the less. 
*/ if (sts = _sd_recovery_wblk_wait(cd)) { handle->bh_flag &= ~NSC_HACTIVE; SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, sts); goto done; } } /* write & disk failed, return error immediately */ if ((flag & NSC_WRBUF) && cdi->cd_info->sh_failed) { handle->bh_flag &= ~NSC_HACTIVE; SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, EIO); sts = EIO; goto done; } setup: _SD_SETUP_HANDLE(handle, cd, fba_pos, fba_len, flag); handle->bh_centry = NULL; bufvec = handle->bh_bufvec; if (flag & NSC_RDAHEAD) { /* _sd_prefetch_opt == 0 */ /* CKD prefetch: bufvec not req'd, use placeholder */ bufvec->bufaddr = NULL; bufvec->bufvmeaddr = NULL; bufvec->buflen = 0; bufvec = _prefetch_sb_vec; } st_cblk = FBA_TO_BLK_NUM(fba_pos); st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); cblk = st_cblk; /* * count number of blocks on chain that is required */ /* middle piece */ dmchain_request_blocks = (fba_len - (st_cblk_len + end_cblk_len)) >> BLK_FBA_SHFT; /* start piece */ ++dmchain_request_blocks; /* end piece */ if (end_cblk_len) ++dmchain_request_blocks; cc_flag = 0; if ((handle->bh_flag & NSC_PINNABLE) && (handle->bh_flag & NSC_WRBUF)) cc_flag |= CC_PINNABLE; if (handle->bh_flag & (NSC_NOCACHE|NSC_SEQ_IO)) cc_flag |= CC_QHEAD; lentry = NULL; stall = 0; do { pageio = ((flag & NSC_PAGEIO) != 0 || sdbc_pageio_always != 0); cget: if ((centry = (_sd_cctl_t *) _sd_hash_search(cd, cblk, _sd_htable)) != 0) { if (SET_CENTRY_INUSE(centry)) { /* already inuse: wait for block, retry */ sdbc_allocb_inuse++; if (locked) rw_exit(&sdbc_queue_lock); _sd_cc_wait(cd, cblk, centry, CC_INUSE); if (locked) rw_enter(&sdbc_queue_lock, RW_WRITER); goto cget; } /* * bug 4529671 * now that we own the centry make sure that * it is still good. it could have been processed * by _sd_dealloc_dm() in the window between * _sd_hash_search() and SET_CENTRY_INUSE(). */ if ((_sd_cctl_t *) _sd_hash_search(cd, cblk, _sd_htable) != centry) { sdbc_allocb_deallocd++; #ifdef DEBUG cmn_err(CE_WARN, "!centry %p cd %d cblk %" NSC_SZFMT " fba_len %" NSC_SZFMT " lost to dealloc?! " "cc_data %p", (void *)centry, cd, cblk, fba_orig_len, (void *)centry->cc_data); #endif CLEAR_CENTRY_INUSE(centry); goto cget; } if (CC_CD_BLK_MATCH(cd, cblk, centry)) { /* * Do pagelist io mutual exclusion * before messing with the centry. */ if (pageio && SET_CENTRY_PAGEIO(centry)) { /* wait for flusher to finish pageio */ sdbc_allocb_pageio1++; CLEAR_CENTRY_INUSE(centry); if (locked) rw_exit(&sdbc_queue_lock); _sd_cc_wait(cd, cblk, centry, CC_PAGEIO); if (locked) rw_enter(&sdbc_queue_lock, RW_WRITER); goto cget; } sdbc_allocb_hit++; this_entry_type = HASH_ENTRY_DM; pageio = 0; centry->cc_toflush = 0; centry->cc_hits++; /* this will reset the age flag */ sdbc_centry_init_dm(centry); DTRACE_PROBE1(_sd_alloc_buf1, _sd_cctl_t *, centry); } else { /* block mismatch: release, alloc new block */ sdbc_allocb_lost++; CLEAR_CENTRY_INUSE(centry); goto cget; } } else { centry = sdbc_centry_alloc(cd, cblk, dmchain_request_blocks, &stall, &alloc_tok, locked ? ALLOC_LOCKED : 0); /* * dmchaining adjustment. * if centry was obtained from the dmchain * then clear local pageio variable because the * centry already has cc_pageio set. 
*/ if (CENTRY_PAGEIO(centry)) pageio = 0; DTRACE_PROBE1(_sd_alloc_buf2, _sd_cctl_t *, centry); this_entry_type = ELIGIBLE_ENTRY_DM; if (centry->cc_aging_dm & FOUND_IN_HASH_DM) this_entry_type = HASH_ENTRY_DM; else { if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM) this_entry_type = HOLD_ENTRY_DM; } } centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM); /* * Do pagelist io mutual exclusion now if we did not do * it above. */ if (pageio && SET_CENTRY_PAGEIO(centry)) { /* wait for flusher to finish pageio */ sdbc_allocb_pageio2++; CLEAR_CENTRY_INUSE(centry); if (locked) rw_exit(&sdbc_queue_lock); _sd_cc_wait(cd, cblk, centry, CC_PAGEIO); if (locked) rw_enter(&sdbc_queue_lock, RW_WRITER); goto cget; } pageio = 0; if (CENTRY_DIRTY(centry)) { /* * end action might set PEND_DIRTY flag * must lock if need to change flag bits */ if (centry->cc_flag != (centry->cc_flag | cc_flag)) { /* was FAST */ mutex_enter(&centry->cc_lock); centry->cc_flag |= cc_flag; /* was FAST */ mutex_exit(&centry->cc_lock); } } else centry->cc_flag |= cc_flag; centry->cc_chain = NULL; /* * step 0: check valid bits in each cache ele as * the chain grows - set ioent/io_pos to first * instance of invalid data */ if (cblk == st_cblk) { handle->bh_centry = centry; fba_len -= st_cblk_len; lentry = centry; if (flag & NSC_RDBUF) { if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry)) { io_pos = fba_pos; ioent = centry; } else { DATA_LOG(SDF_ALLOC, centry, st_cblk_off, st_cblk_len); DTRACE_PROBE4(_sd_alloc_data1, uint64_t, (uint64_t) (BLK_TO_FBA_NUM(cblk) + st_cblk_off), int, st_cblk_len, char *, *(int64_t *) (centry->cc_data + FBA_SIZE(st_cblk_off)), char *, *(int64_t *) (centry->cc_data + FBA_SIZE(st_cblk_off + st_cblk_len) - 8)); } } cblk++; } else if (fba_len == (nsc_size_t)end_cblk_len) { lentry->cc_chain = centry; fba_len -= end_cblk_len; if (flag & NSC_RDBUF) { if (ioent == NULL) { if (!SDBC_VALID_BITS(0, end_cblk_len, centry)) { io_pos = BLK_TO_FBA_NUM(cblk); ioent = centry; } else { DATA_LOG(SDF_ALLOC, centry, 0, end_cblk_len); DTRACE_PROBE4(_sd_alloc_data2, uint64_t, BLK_TO_FBA_NUM(cblk), int, end_cblk_len, char *, *(int64_t *) (centry->cc_data), char *, *(int64_t *) (centry->cc_data + FBA_SIZE(end_cblk_len) - 8)); } } } } else { lentry->cc_chain = centry; lentry = centry; fba_len -= BLK_FBAS; if (flag & NSC_RDBUF) { if (ioent == NULL) { if (!FULLY_VALID(centry)) { io_pos = BLK_TO_FBA_NUM(cblk); ioent = centry; } else { DATA_LOG(SDF_ALLOC, centry, 0, BLK_FBAS); DTRACE_PROBE4(_sd_alloc_data3, uint64_t, (uint64_t) BLK_TO_FBA_NUM(cblk), int, BLK_FBAS, char *, *(int64_t *) (centry->cc_data), char *, *(int64_t *) (centry->cc_data + FBA_SIZE(BLK_FBAS) - 8)); } } } cblk++; } /* if this block has a new identity clear prefetch history */ if (this_entry_type != HASH_ENTRY_DM) centry->cc_aging_dm &= ~(PREFETCH_BUF_I | PREFETCH_BUF_E); centry->cc_aging_dm &= ~(ENTRY_FIELD_DM); centry->cc_aging_dm |= this_entry_type; if (flag & NSC_METADATA) centry->cc_aging_dm |= STICKY_METADATA_DM; --dmchain_request_blocks; } while (fba_len); if (locked) { rw_exit(&sdbc_queue_lock); locked = 0; } ASSERT(dmchain_request_blocks == 0); /* * do any necessary cleanup now that all the blocks are allocated. */ sdbc_centry_alloc_end(&alloc_tok); /* be sure you nul term.
the chain */ centry->cc_chain = NULL; /* * step one: establish HOST/PARASITE/OTHER relationships * between the centry ele in the list and calc the alloc size * (fill in CATAGORY based on TYPE and immediate neighbors) */ if (sts = _sd_setup_category_on_type(handle->bh_centry)) { #ifdef DEBUG err = _sd_free_buf(handle); if (err) { cmn_err(CE_WARN, "!sdbc(_sd_alloc_buf): _sd_free_buf " "failed: err:%d handle:%p", err, (void *)handle); } #else (void) _sd_free_buf(handle); #endif goto done; } /* * step two: alloc the needed mem and fill in the data and chaining * fields (leave bufvec for step three) */ (void) _sd_setup_mem_chaining(handle->bh_centry, 0); /* * step three: do the bufvec */ fba_len = fba_orig_len; centry = handle->bh_centry; bufvec = handle->bh_bufvec; while (centry) { DTRACE_PROBE3(_sd_alloc_buf_centrys, _sd_cctl_t *, centry, int, cd, uint64_t, (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(centry))); if (fba_len == fba_orig_len) { bufvec->bufaddr = (centry->cc_data + FBA_SIZE(st_cblk_off)); bufvec->bufvmeaddr = 0; /* not used */ bufvec->buflen = FBA_SIZE(st_cblk_len); bufvec++; fba_len -= st_cblk_len; } else if (fba_len == (nsc_size_t)end_cblk_len) { _sd_bufvec_t *pbufvec = bufvec - 1; if ((pbufvec->bufaddr + pbufvec->buflen) == centry->cc_data) { /* contiguous */ pbufvec->buflen += FBA_SIZE(end_cblk_len); } else { bufvec->bufaddr = centry->cc_data; bufvec->bufvmeaddr = 0; /* not used */ bufvec->buflen = FBA_SIZE(end_cblk_len); bufvec++; } fba_len -= end_cblk_len; } else { _sd_bufvec_t *pbufvec = bufvec - 1; if ((pbufvec->bufaddr + pbufvec->buflen) == centry->cc_data) { /* contiguous */ pbufvec->buflen += CACHE_BLOCK_SIZE; } else { bufvec->bufaddr = centry->cc_data; bufvec->bufvmeaddr = 0; /* not used */ bufvec->buflen = CACHE_BLOCK_SIZE; bufvec++; } fba_len -= BLK_FBAS; } centry = centry->cc_chain; } /* be sure you nul term. the chain */ bufvec->bufaddr = NULL; bufvec->bufvmeaddr = 0; bufvec->buflen = 0; /* frag statistics */ { _sd_bufvec_t *tbufvec; for (tbufvec = handle->bh_bufvec; tbufvec != bufvec; ++tbufvec) { if ((min_frag > tbufvec->buflen) || (min_frag == 0)) min_frag = tbufvec->buflen; if (max_frag < tbufvec->buflen) max_frag = tbufvec->buflen; } nfrags = bufvec - handle->bh_bufvec; min_frag = FBA_LEN(min_frag); max_frag = FBA_LEN(max_frag); } /* buffer memory frag stats */ DTRACE_PROBE4(_sd_alloc_buf_frag, uint64_t, (uint64_t)fba_orig_len, int, nfrags, int, min_frag, int, max_frag); if (flag & NSC_WRBUF) { if (_SD_IS_WRTHRU(handle)) goto alloc_done; if (_sd_alloc_write(handle->bh_centry, &stall)) { _sd_unblock(&_sd_flush_cv); handle->bh_flag |= NSC_FORCED_WRTHRU; } else { for (centry = handle->bh_centry; centry; centry = centry->cc_chain) { CENTRY_SET_FTPOS(centry); SSOP_SETCENTRY(sdbc_safestore, centry->cc_write); } } } alloc_done: if (locked) { rw_exit(&sdbc_queue_lock); locked = 0; } if (ioent) { _SD_DISCONNECT_CALLBACK(handle); sts = _sd_doread(handle, ioent, io_pos, (fba_pos + fba_orig_len - io_pos), flag); if (sts > 0) (void) _sd_free_buf(handle); } else if (flag & NSC_RDBUF) { CACHE_FBA_READ(cd, fba_orig_len); CACHE_READ_HIT; FBA_READ_IO_KSTATS(cd, FBA_SIZE(fba_orig_len)); sts = NSC_HIT; } else sts = (stall) ? 
NSC_DONE : NSC_HIT; SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_orig_len, fba_pos, flag, sts); done: if (locked) rw_exit(&sdbc_queue_lock); KSTAT_RUNQ_EXIT(cd); return (sts); } /* * consistency checking for ccents */ #define ELIGIBLE(p) (p & ELIGIBLE_ENTRY_DM) #define HOLD(p) (p & HOLD_ENTRY_DM) #define HASHE(p) (p & HASH_ENTRY_DM) #define HOST(p) (p & HOST_ENTRY_DM) #define PARA(p) (p & PARASITIC_ENTRY_DM) #define OTHER(p) \ (!(p & (HOST_ENTRY_DM | PARASITIC_ENTRY_DM | ELIGIBLE_ENTRY_DM))) #define AVAIL(p) (p & AVAIL_ENTRY_DM) /* * sdbc_check_cctl_cot -- consistency check for _sd_setup_category_on_type() * may only be called on entry to state machine (when ccent is either * ELIGIBLE_ENTRY_DM, HOLD_ENTRY_DM or HASH_ENTRY_DM). * * print message or panic (DEBUG) if inconsistency detected. */ static int sdbc_check_cctl_cot(_sd_cctl_t *centry) { uint_t age; int size; uchar_t *data; int host_or_other; int para; int ccent_ok = 1; age = centry->cc_aging_dm; size = centry->cc_alloc_size_dm; data = centry->cc_data; host_or_other = size && data; para = !size && data; /* * on entry to _sd_setup_category_on_type(), * one of three mutually exclusive entry field bits must be set */ switch ((age & (ELIGIBLE_ENTRY_DM | HOLD_ENTRY_DM | HASH_ENTRY_DM))) { case ELIGIBLE_ENTRY_DM: case HOLD_ENTRY_DM: case HASH_ENTRY_DM: /* ok */ break; default: /* zero or multiple flag bits */ ccent_ok = 0; break; } /* categories are mutually exclusive */ if (HOST(age) && PARA(age)) ccent_ok = 0; /* these bits should be cleared out (STICKY_METADATA_DM not used) */ if (age & (AVAIL_ENTRY_DM | FOUND_HOLD_OVER_DM | FOUND_IN_HASH_DM | STICKY_METADATA_DM)) ccent_ok = 0; /* eligible has no data and no size */ if (ELIGIBLE(age) && (size || data)) ccent_ok = 0; /* parasite has zero size and non-zero data */ if (PARA(age) && !para) ccent_ok = 0; /* host has non-zero size and non-zero data */ if (HOST(age) && !host_or_other) ccent_ok = 0; /* "other" is just like a host */ if (OTHER(age) && !host_or_other) ccent_ok = 0; /* a HOLD or a HASH must have a size */ if ((size) && !(age & (HASH_ENTRY_DM | HOLD_ENTRY_DM))) ccent_ok = 0; if (!ccent_ok) cmn_err(cmn_level, "!sdbc(sdbc_check_cctl_cot): inconsistent ccent %p " "age %x size %d data %p", (void *)centry, age, size, (void *)data); return (ccent_ok); } /* * sdbc_mark_cctl_cot -- mark cctls bad and invalidate when * inconsistency found in _sd_setup_category_on_type() * returns nothing * * Note: this is an error recovery path that is triggered when an * inconsistency in a cctl is detected. _sd_centry_release() will take * these cache entries out of circulation and place them on a separate list * for debugging purposes. */ void sdbc_mark_cctl_cot(_sd_cctl_t *header, _sd_cctl_t *centry) { _sd_cctl_t *cur_ent = header; /* the entire chain is guilty by association */ while (cur_ent) { (void) _sd_hash_delete((struct _sd_hash_hd *)cur_ent, _sd_htable); cur_ent->cc_aging_dm |= BAD_CHAIN_DM; cur_ent = cur_ent->cc_chain; } centry->cc_aging_dm |= BAD_ENTRY_DM; /* this is the problem child */ } /* * _sd_setup_category_on_type(_sd_cctl_t *) - Setup the centry CATEGORY based on * centry TYPE and immediate neighbors. Identify each eligible (ie not HASH) * centry as a host/parasite. host actually have memory allocated to * them and parasites are chained to the host and point to page offsets within * the host's memory. 
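 *
 * Illustrative example (informal, derived from the state machine
 * below): for a chain of three ELIGIBLE entries E1 -> E2 -> E3,
 *
 *	E1 becomes a HOST with cc_alloc_size_dm == BLK_SIZE(3)
 *	E2 becomes a PARASITE at offset BLK_SIZE(1) into E1's memory
 *	E3 becomes a PARASITE at offset BLK_SIZE(2) into E1's memory
 *
 * so one later kmem allocation made for E1 backs all three blocks.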
* * RETURNS: * 0 on success, EINTR if inconsistency detected in centry * * Note: * none */ static int _sd_setup_category_on_type(_sd_cctl_t *header) { _sd_cctl_t *prev_ent, *next_ent, *centry; _sd_cctl_t *anchor = NULL; int current_pest_count, local_max_dyn_list; int cl; int ret = 0; ASSERT(header); if (sdbc_use_dmchain) local_max_dyn_list = max_dm_queues - 1; else { /* pickup a fresh copy - has the world changed */ local_max_dyn_list = dynmem_processing_dm.max_dyn_list; } prev_ent = 0; centry = header; next_ent = centry->cc_chain; current_pest_count = 0; cl = 2; /* try to recover from bad cctl */ if (sdbc_check_cot && !sdbc_check_cctl_cot(centry)) ret = EINTR; while (cl && (ret == 0)) { switch (cl) { case (1): /* chain to next/monitor for completion */ prev_ent = centry; centry = next_ent; next_ent = 0; cl = 0; if (centry) { if (sdbc_check_cot && !sdbc_check_cctl_cot(centry)) { ret = EINTR; break; } next_ent = centry->cc_chain; cl = 2; } break; case (2): /* vector to appropriate routine */ if (!(centry->cc_aging_dm & ELIGIBLE_ENTRY_DM)) cl = 5; else if (prev_ent && (prev_ent->cc_aging_dm & ELIGIBLE_ENTRY_DM)) cl = 15; else cl = 10; break; case (5): /* process NON-ELIGIBLE entries */ if (!(centry->cc_aging_dm & (HASH_ENTRY_DM|HOLD_ENTRY_DM))) { /* no catagory */ /* consistency check */ if (centry->cc_alloc_size_dm || centry->cc_data) { cmn_err(cmn_level, "!sdbc(setup_cot): " "OTHER with data/size %p", (void *)centry); ret = EINTR; break; } centry->cc_aging_dm &= ~CATAGORY_ENTRY_DM; centry->cc_alloc_size_dm = BLK_SIZE(1); DTRACE_PROBE1(_sd_setup_category, _sd_cctl_t *, centry); } cl = 1; break; /* * no prev entry (ie top of list) or no prev * ELIGIBLE entry */ case (10): /* * this is an eligible entry, does it start * a list or is it a loner */ /* consistency check */ if (centry->cc_alloc_size_dm || centry->cc_data) { cmn_err(cmn_level, "!sdbc(setup_cot): " "HOST with data/size %p", (void *)centry); ret = EINTR; break; } if (next_ent && (next_ent->cc_aging_dm & ELIGIBLE_ENTRY_DM)) { /* it starts a list */ /* host catagory */ centry->cc_aging_dm |= HOST_ENTRY_DM; /* start out with one page */ centry->cc_alloc_size_dm = BLK_SIZE(1); anchor = centry; DTRACE_PROBE1(_sd_setup_category, _sd_cctl_t *, anchor); cl = 1; } else { /* * it's a loner * drop status to no category and * restart */ cl = 2; centry->cc_aging_dm &= ~ELIGIBLE_ENTRY_DM; } break; case (15): /* default to parasite catagory */ /* consistency check */ if (centry->cc_alloc_size_dm || centry->cc_data) { cmn_err(cmn_level, "!sdbc(setup_cot): " "PARA with data/size %p", (void *)centry); ret = EINTR; break; } if (current_pest_count < local_max_dyn_list-1) { /* continue to grow the pest list */ current_pest_count++; centry->cc_aging_dm |= PARASITIC_ENTRY_DM; /* * offset of host ent mem this will pt * to */ centry->cc_alloc_size_dm = anchor->cc_alloc_size_dm; /* * up the host mem req by one for * this parasite */ DTRACE_PROBE1(_sd_setup_category, _sd_cctl_t *, centry); anchor->cc_alloc_size_dm += BLK_SIZE(1); cl = 1; } else { /* * term this pest list - restart fresh * on this entry */ current_pest_count = 0; prev_ent->cc_aging_dm &= ~(HOST_ENTRY_DM|ELIGIBLE_ENTRY_DM); cl = 2; } break; } /* switch(cl) */ } /* while (cl) */ if (ret != 0) sdbc_mark_cctl_cot(header, centry); return (ret); } /* * _sd_setup_mem_chaining(_sd_cctl_t *) - Allocate memory, setup * mem ptrs an host/pest chaining. Do the actual allocation as described in * sd_setup_category_on_type(). 
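 *
 * Informal sketch of the outcome for the HOST/PARASITE example in the
 * _sd_setup_category_on_type() comment above (assuming the allocation
 * succeeds):
 *
 *	E1->cc_data = kmem_alloc(BLK_SIZE(3), sleep), E1->cc_head_dm = E1
 *	E2->cc_data = E1->cc_data + BLK_SIZE(1), E2->cc_head_dm = E1
 *	E3->cc_data = E1->cc_data + BLK_SIZE(2), E3->cc_head_dm = E1
 *
 * and the parasites have cc_alloc_size_dm reset to 0.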
* * RETURNS: * 0 on success * non-zero on error * * Note: * if called with ALLOC_NOWAIT, caller must check for non-zero return */ static int _sd_setup_mem_chaining(_sd_cctl_t *header, int flag) { _sd_cctl_t *prev_ent, *next_ent, *centry; _sd_cctl_t *anchor = NULL; int cl, rc = 0; ASSERT(header); if (!header) return (0); prev_ent = 0; centry = header; next_ent = centry->cc_chain; cl = 2; while (cl) { switch (cl) { case (1): /* chain to next/monitor for completion */ centry->cc_aging_dm &= ~ELIGIBLE_ENTRY_DM; prev_ent = centry; centry = next_ent; next_ent = 0; cl = 0; if (centry) { next_ent = centry->cc_chain; cl = 2; } break; case (2): /* vector to appropriate routine */ if (centry->cc_aging_dm & HOST_ENTRY_DM) cl = 10; else if (centry->cc_aging_dm & PARASITIC_ENTRY_DM) cl = 15; else cl = 5; break; case (5): /* OTHER processing - alloc mem */ if (rc = sdbc_centry_memalloc_dm(centry, centry->cc_alloc_size_dm, flag)) /* The allocation failed */ cl = 0; else cl = 1; break; /* * HOST entry processing - save the anchor pt, * alloc the memory, */ case (10): /* setup head and nxt ptrs */ anchor = centry; if (rc = sdbc_centry_memalloc_dm(centry, centry->cc_alloc_size_dm, flag)) /* The allocation failed */ cl = 0; else cl = 1; break; /* * PARASITIC entry processing - setup w/no * memory, setup head/next ptrs, */ case (15): /* * fudge the data mem ptr to an offset from * the anchor alloc */ if (!(centry->cc_aging_dm & (HASH_ENTRY_DM| HOLD_ENTRY_DM))) { centry->cc_head_dm = anchor; /* chain prev to this */ prev_ent->cc_next_dm = centry; /* * generate the actual data ptr into * host entry memory */ centry->cc_data = anchor->cc_data + centry->cc_alloc_size_dm; centry->cc_alloc_size_dm = 0; } cl = 1; break; } /* switch(cl) */ } /* while (cl) */ return (rc); } /* * _sd_check_buffer_alloc - Check if buffer allocation is invalid. * * RETURNS: * 0 if its ok to continue with allocation. * Else errno to be returned to the user. * * Note: * This routine could block if the device is not local and * recovery is in progress. */ /* ARGSUSED */ static int _sd_check_buffer_alloc(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, _sd_buf_handle_t **hp) { /* * This check exists to ensure that someone will not pass in an * arbitrary pointer and try to pass it off as a handle. */ if ((*hp)->bh_flag & (~_SD_VALID_FLAGS)) { cmn_err(CE_WARN, "!sdbc(_sd_check_buffer_alloc) " "cd %d invalid handle %p flags %x", cd, (void *)*hp, (*hp)->bh_flag); return (EINVAL); } if ((_sd_cache_initialized == 0) || (FILE_OPENED(cd) == 0)) { cmn_err(CE_WARN, "!sdbc(_sd_check_buffer_alloc) " "cd %d not open. Cache init %d", cd, _sd_cache_initialized); return (EINVAL); } ASSERT(cd >= 0); if (!(_sd_cache_files[cd].cd_rawfd) || !nsc_held(_sd_cache_files[cd].cd_rawfd)) { cmn_err(CE_WARN, "!sdbc(_sd_check_buffer_alloc) cd %d is not attached", cd); return (EINVAL); } ASSERT_IO_SIZE(fba_pos, fba_len, cd); ASSERT_LEN(fba_len); return (0); } /* * sdbc_check_handle -- check that handle is valid * return 1 if ok, 0 otherwise (if debug then panic). */ static int sdbc_check_handle(_sd_buf_handle_t *handle) { int ret = 1; if (!_SD_HANDLE_ACTIVE(handle)) { cmn_err(cmn_level, "!sdbc(_sd_free_buf): invalid handle %p" "cd %d fpos %" NSC_SZFMT " flen %" NSC_SZFMT " flag %x", (void *)handle, HANDLE_CD(handle), handle->bh_fba_pos, handle->bh_fba_len, handle->bh_flag); ret = 0; } return (ret); } /* * _sd_free_buf - Free the buffers allocated in _sd_alloc_buf. * * ARGUMENTS: * handle - The handle allocated in _sd_alloc_buf. * * RETURNS: * 0 on success. * Else errno. 
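 *
 * Typical pairing with the allocation path (illustrative sketch only,
 * with placeholder variable names; real callers also handle
 * NSC_PENDING and asynchronous completion):
 *
 *	if ((sts = _sd_alloc_buf(xcd, pos, len, NSC_RDBUF, &h)) > 0)
 *		return (sts);
 *	... use h->bh_bufvec ...
 *	(void) _sd_free_buf(h);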
* * NOTE: * If handle was allocated through _sd_alloc_buf, the handle allocated * flag (NSC_HALLOCATED) will be reset by _sd_alloc_buf. This indicates * that _sd_free_buf should free up the handle as well. * All other handles directly allocated from _sd_alloc_handle will have * that flag set. Any handle with valid blocks will have the handle * active flag. It is an error if the active flag is not set. * (if free_buf were called without going through alloc_buf) */ int _sd_free_buf(_sd_buf_handle_t *handle) { _sd_cctl_t *centry, *cc_chain; int cd = HANDLE_CD(handle); int flen = handle->bh_fba_len; int fpos = handle->bh_fba_pos; SDTRACE(ST_ENTER|SDF_FREEBUF, HANDLE_CD(handle), handle->bh_fba_len, handle->bh_fba_pos, 0, 0); if (sdbc_check_handle(handle) == 0) return (EINVAL); if (handle->bh_flag & NSC_MIXED) { /* * Data in this handle will be a mix of data from the * source device and data from another device, so * invalidate all the blocks. */ handle->bh_flag &= ~NSC_QUEUE; centry = handle->bh_centry; while (centry) { centry->cc_valid = 0; centry = centry->cc_chain; } } if ((handle->bh_flag & NSC_QUEUE)) { handle->bh_flag &= ~NSC_QUEUE; _sd_queue_write(handle, handle->bh_fba_pos, handle->bh_fba_len); } handle->bh_flag &= ~NSC_HACTIVE; centry = handle->bh_centry; while (centry) { cc_chain = centry->cc_chain; _sd_centry_release(centry); centry = cc_chain; } /* * help prevent dup call to _sd_centry_release if this handle * is erroneously _sd_free_buf'd twice. (should not happen). */ handle->bh_centry = NULL; if ((handle->bh_flag & NSC_HALLOCATED) == 0) { handle->bh_flag |= NSC_HALLOCATED; (void) _sd_free_handle(handle); } else { handle->bh_flag = NSC_HALLOCATED; } SDTRACE(ST_EXIT|SDF_FREEBUF, cd, flen, fpos, 0, 0); return (0); } static int _sd_lruq_srch = 0x2000; /* * sdbc_get_dmchain -- get a candidate centry chain pointing to * contiguous memory * ARGUMENTS: * cblocks - number of cache blocks requested * stall - pointer to stall count (no blocks avail) * flag - ALLOC_NOWAIT flag * * RETURNS: * a cache entry or possible NULL if ALLOC_NOWAIT set * USAGE: * attempt to satisfy entire request from queue * that has no memory allocated. * if this fails then attempt a partial allocation * with a preallocated block of requested size up to * max_dyn_list. * then look for largest chain less than max_dyn_list. */ static _sd_cctl_t * sdbc_get_dmchain(int cblocks, int *stall, int flag) { _sd_cctl_t *cc_dmchain = NULL; _sd_queue_t *q; _sd_cctl_t *qhead; int num_tries; int cblocks_orig = cblocks; int nowait = flag & ALLOC_NOWAIT; int i; num_tries = _sd_lruq_srch; ASSERT(cblocks != 0); while (!cc_dmchain) { /* get it from the os if possible */ q = &sdbc_dm_queues[0]; qhead = &(q->sq_qhead); if (q->sq_inq >= cblocks) { mutex_enter(&q->sq_qlock); if (q->sq_inq >= cblocks) { _sd_cctl_t *cc_ent; cc_dmchain = qhead->cc_next; /* * set the inuse and pageio bits * Note: this code expects the cc_ent to * be available. no other thread may set the * inuse or pageio bit for an entry on the * 0 queue. */ cc_ent = qhead; for (i = 0; i < cblocks; ++i) { cc_ent = cc_ent->cc_next; if (SET_CENTRY_INUSE(cc_ent)) { cmn_err(CE_PANIC, "centry inuse on 0 q! %p", (void *)cc_ent); } if (SET_CENTRY_PAGEIO(cc_ent)) { cmn_err(CE_PANIC, "centry pageio on 0 q! 
%p", (void *)cc_ent); } } /* got a dmchain */ /* remove this chain from the 0 queue */ cc_dmchain->cc_prev->cc_next = cc_ent->cc_next; cc_ent->cc_next->cc_prev = cc_dmchain->cc_prev; cc_dmchain->cc_prev = NULL; cc_ent->cc_next = NULL; q->sq_inq -= cblocks; ASSERT(GOOD_LRUSIZE(q)); } mutex_exit(&q->sq_qlock); if (cc_dmchain) continue; } /* look for a pre-allocated block of the requested size */ if (cblocks > (max_dm_queues - 1)) cblocks = max_dm_queues - 1; q = &sdbc_dm_queues[cblocks]; qhead = &(q->sq_qhead); if (q->sq_inq != 0) { _sd_cctl_t *tmp_dmchain; mutex_enter(&q->sq_qlock); for (tmp_dmchain = qhead->cc_next; tmp_dmchain != qhead; tmp_dmchain = tmp_dmchain->cc_next) { /* * get a dmchain * set the inuse and pageio bits */ if (sdbc_dmchain_avail(tmp_dmchain)) { /* put on MRU end of queue */ sdbc_requeue_dmchain(q, tmp_dmchain, 1, 0); cc_dmchain = tmp_dmchain; break; } sdbc_dmchain_not_avail++; } mutex_exit(&q->sq_qlock); if (cc_dmchain) continue; } /* * spin block * nudge the deallocator, accelerate ageing */ mutex_enter(&dynmem_processing_dm.thread_dm_lock); cv_broadcast(&dynmem_processing_dm.thread_dm_cv); mutex_exit(&dynmem_processing_dm.thread_dm_lock); if (nowait) break; if (!(--num_tries)) { delay(drv_usectohz(20000)); (void) (*stall)++; num_tries = _sd_lruq_srch; cblocks = cblocks_orig; } else { /* see if smaller request size is available */ if (!(--cblocks)) cblocks = cblocks_orig; } } /* while (!cc_dmchain) */ return (cc_dmchain); } static int sdbc_dmchain_avail(_sd_cctl_t *cc_ent) { int chain_avail = 1; _sd_cctl_t *anchor = cc_ent; while (cc_ent) { ASSERT(_sd_cctl_valid(cc_ent)); if (cc_ent->cc_aging_dm & BAD_CHAIN_DM) { chain_avail = 0; break; } if (CENTRY_DIRTY(cc_ent)) { chain_avail = 0; break; } if (SET_CENTRY_INUSE(cc_ent)) { chain_avail = 0; break; } if ((SET_CENTRY_PAGEIO(cc_ent))) { CLEAR_CENTRY_INUSE(cc_ent); chain_avail = 0; break; } if (CENTRY_DIRTY(cc_ent)) { CLEAR_CENTRY_PAGEIO(cc_ent); CLEAR_CENTRY_INUSE(cc_ent); chain_avail = 0; break; } cc_ent->cc_flag = 0; cc_ent->cc_toflush = 0; cc_ent = cc_ent->cc_next_dm; } if (!chain_avail) sdbc_clear_dmchain(anchor, cc_ent); else { cc_ent = anchor; /* * prevent possible deadlocks in _sd_cc_wait(): * remove from hash and wakeup any waiters now that we * have acquired the chain. 
*/ while (cc_ent) { (void) _sd_hash_delete((struct _sd_hash_hd *)cc_ent, _sd_htable); mutex_enter(&cc_ent->cc_lock); if (cc_ent->cc_await_use) { cv_broadcast(&cc_ent->cc_blkcv); } mutex_exit(&cc_ent->cc_lock); cc_ent->cc_creat = nsc_lbolt(); cc_ent->cc_hits = 0; cc_ent = cc_ent->cc_next_dm; } } return (chain_avail); } static void sdbc_clear_dmchain(_sd_cctl_t *cc_ent_start, _sd_cctl_t *cc_ent_end) { _sd_cctl_t *cc_ent = cc_ent_start; _sd_cctl_t *prev_ent; ASSERT(_sd_cctl_valid(cc_ent)); while (cc_ent != cc_ent_end) { ASSERT(_sd_cctl_valid(cc_ent)); prev_ent = cc_ent; cc_ent = cc_ent->cc_next_dm; CLEAR_CENTRY_PAGEIO(prev_ent); CLEAR_CENTRY_INUSE(prev_ent); } } /* * put a dmchain on the LRU end of a queue */ void sdbc_ins_dmqueue_front(_sd_queue_t *q, _sd_cctl_t *cc_ent) { _sd_cctl_t *qhead = &(q->sq_qhead); ASSERT(_sd_cctl_valid(cc_ent)); mutex_enter(&q->sq_qlock); cc_ent->cc_next = qhead->cc_next; cc_ent->cc_prev = qhead; qhead->cc_next->cc_prev = cc_ent; qhead->cc_next = cc_ent; q->sq_inq++; cc_ent->cc_cblocks = q->sq_dmchain_cblocks; ASSERT(GOOD_LRUSIZE(q)); mutex_exit(&q->sq_qlock); } /* * put a dmchain on the MRU end of a queue */ static void sdbc_ins_dmqueue_back(_sd_queue_t *q, _sd_cctl_t *cc_ent) { _sd_cctl_t *qhead = &(q->sq_qhead); ASSERT(_sd_cctl_valid(cc_ent)); mutex_enter(&q->sq_qlock); cc_ent->cc_next = qhead; cc_ent->cc_prev = qhead->cc_prev; qhead->cc_prev->cc_next = cc_ent; qhead->cc_prev = cc_ent; cc_ent->cc_seq = q->sq_seq++; q->sq_inq++; cc_ent->cc_cblocks = q->sq_dmchain_cblocks; ASSERT(GOOD_LRUSIZE(q)); mutex_exit(&q->sq_qlock); } /* * remove dmchain from a queue */ void sdbc_remq_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent) { ASSERT(_sd_cctl_valid(cc_ent)); mutex_enter(&q->sq_qlock); cc_ent->cc_prev->cc_next = cc_ent->cc_next; cc_ent->cc_next->cc_prev = cc_ent->cc_prev; cc_ent->cc_next = cc_ent->cc_prev = NULL; /* defensive programming */ cc_ent->cc_cblocks = -1; /* indicate not on any queue */ q->sq_inq--; ASSERT(GOOD_LRUSIZE(q)); mutex_exit(&q->sq_qlock); } /* * requeue a dmchain to the MRU end of its queue. * if getlock is 0 on entry the queue lock (sq_qlock) must be held */ void sdbc_requeue_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent, int mru, int getlock) { _sd_cctl_t *qhead = &(q->sq_qhead); ASSERT(_sd_cctl_valid(cc_ent)); if (getlock) mutex_enter(&q->sq_qlock); /* inline of sdbc_remq_dmchain() */ cc_ent->cc_prev->cc_next = cc_ent->cc_next; cc_ent->cc_next->cc_prev = cc_ent->cc_prev; if (mru) { /* put on MRU end of queue */ /* inline of sdbc_ins_dmqueue_back */ cc_ent->cc_next = qhead; cc_ent->cc_prev = qhead->cc_prev; qhead->cc_prev->cc_next = cc_ent; qhead->cc_prev = cc_ent; cc_ent->cc_seq = q->sq_seq++; (q->sq_req_stat)++; } else { /* put on LRU end of queue i.e. 
requeue to head */ /* inline of sdbc_ins_dmqueue_front */ cc_ent->cc_next = qhead->cc_next; cc_ent->cc_prev = qhead; qhead->cc_next->cc_prev = cc_ent; qhead->cc_next = cc_ent; cc_ent->cc_seq = q->sq_seq++; /* * clear the CC_QHEAD bit on all members of the chain */ { _sd_cctl_t *tcent; for (tcent = cc_ent; tcent; tcent = tcent->cc_next_dm) tcent->cc_flag &= ~CC_QHEAD; } } if (getlock) mutex_exit(&q->sq_qlock); } /* * sdbc_dmchain_dirty(cc_ent) * return first dirty cc_ent in dmchain, NULL if chain is not dirty */ static _sd_cctl_t * sdbc_dmchain_dirty(_sd_cctl_t *cc_ent) { for (/* CSTYLED */; cc_ent; cc_ent = cc_ent->cc_next_dm) if (CENTRY_DIRTY(cc_ent)) break; return (cc_ent); } /* * sdbc_requeue_head_dm_try() * attempt to requeue a dmchain to the head of the queue */ void sdbc_requeue_head_dm_try(_sd_cctl_t *cc_ent) { int qidx; _sd_queue_t *q; if (!sdbc_dmchain_dirty(cc_ent)) { qidx = cc_ent->cc_cblocks; q = &sdbc_dm_queues[qidx]; sdbc_requeue_dmchain(q, cc_ent, 0, 1); /* requeue head */ } } /* * sdbc_centry_alloc_blks -- allocate cache entries with memory * * ARGUMENTS: * cd - Cache descriptor (from a previous open) * cblk - cache block number. * reqblks - number of cache blocks to be allocated * flag - can be ALLOC_NOWAIT * RETURNS: * A cache block chain or NULL if ALLOC_NOWAIT and request fails * * Note: caller must check for null return if called with * ALLOC_NOWAIT set. */ _sd_cctl_t * sdbc_centry_alloc_blks(int cd, nsc_off_t cblk, nsc_size_t reqblks, int flag) { sdbc_allocbuf_t alloc_tok = {0}; /* must be 0 */ int stall = 0; _sd_cctl_t *centry = NULL; _sd_cctl_t *lentry = NULL; _sd_cctl_t *anchor = NULL; _sd_cctl_t *next_centry; ASSERT(reqblks); while (reqblks) { centry = sdbc_centry_alloc(cd, cblk, reqblks, &stall, &alloc_tok, flag); if (!centry) break; centry->cc_chain = NULL; if (lentry == NULL) anchor = centry; else lentry->cc_chain = centry; lentry = centry; centry->cc_aging_dm &= ~(ENTRY_FIELD_DM); if (centry->cc_aging_dm & FOUND_IN_HASH_DM) centry->cc_aging_dm |= HASH_ENTRY_DM; else if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM) centry->cc_aging_dm |= HOLD_ENTRY_DM; else centry->cc_aging_dm |= ELIGIBLE_ENTRY_DM; centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM); --reqblks; } sdbc_centry_alloc_end(&alloc_tok); if (reqblks || (_sd_setup_category_on_type(anchor))) { centry = anchor; while (centry) { next_centry = centry->cc_chain; _sd_centry_release(centry); centry = next_centry; } anchor = NULL; } else /* This is where the memory is actually allocated */ if (_sd_setup_mem_chaining(anchor, flag)) anchor = NULL; return (anchor); } /* * sdbc_centry_alloc - sdbc internal function to allocate a new cache block. * * ARGUMENTS: * cd - Cache descriptor (from a previous open) * cblk - cache block number. * stall - pointer to stall count (no blocks avail) * req_blocks - number of cache blocks remaining in caller's i/o request * alloc_tok - pointer to token initialized to 0 on first call to function * flag - lock status of sdbc_queue_lock or ALLOC_NOWAIT flag * RETURNS: * A cache block, or possibly NULL if ALLOC_NOWAIT set . * * USAGE: * switch to the appropriate allocation function. * this function is used when callers need more than one cache block. * it is called repeatedly until the entire request is satisfied, * at which time the caller will then do the memory allocation. * if only one cache block is needed callers may use * sdbc_centry_alloc_blks() which also allocates memory. * * Note: caller must check for null return if called with * ALLOC_NOWAIT set. 
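 *
 * Condensed usage sketch (see sdbc_centry_alloc_blks() above for the
 * real loop; error handling omitted):
 *
 *	sdbc_allocbuf_t tok = {0};
 *	int stall = 0;
 *
 *	while (reqblks) {
 *		centry = sdbc_centry_alloc(cd, cblk, reqblks, &stall,
 *		    &tok, ALLOC_NOWAIT);
 *		if (centry == NULL)
 *			break;
 *		(chain centry onto the caller's list)
 *		--reqblks;
 *	}
 *	sdbc_centry_alloc_end(&tok);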
*/ _sd_cctl_t * sdbc_centry_alloc(int cd, nsc_off_t cblk, nsc_size_t req_blocks, int *stall, sdbc_allocbuf_t *alloc_tok, int flag) { _sd_cctl_t *centry; if (sdbc_use_dmchain) centry = sdbc_alloc_dmc(cd, cblk, req_blocks, stall, alloc_tok, flag); else centry = sdbc_alloc_lru(cd, cblk, stall, flag); return (centry); } /* * sdbc_alloc_dmc -- allocate a centry from a dmchain * * ARGUMENTS: * cd - Cache descriptor (from a previous open) * cblk - cache block number. * stall - pointer to stall count (no blocks avail) * req_blocks - number of cache blocks in clients i/o request * alloc_tok - pointer to token initialized to 0 on first call to function * flag - lock status of sdbc_queue_lock, or ALLOC_NOWAIT flag * RETURNS: * A cache block or possibly NULL if ALLOC_NOWAIT set * * USAGE: * if dmchain is empty, allocate one. */ static _sd_cctl_t * sdbc_alloc_dmc(int cd, nsc_off_t cblk, nsc_size_t req_blocks, int *stall, sdbc_allocbuf_t *alloc_tok, int flag) { sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok; _sd_cctl_t *centry = NULL; if (!dmc->sab_dmchain) { /* * Note - sdbc_get_dmchain() returns * with cc_inuse and cc_pageio set * for all members of dmchain. */ if (dmc->sab_dmchain = sdbc_get_dmchain(req_blocks, stall, flag)) { /* remember q it came from */ if (dmc->sab_dmchain->cc_alloc_size_dm) dmc->sab_q = dmc->sab_dmchain->cc_cblocks; } } /* * Note: dmchain pointer is advanced in sdbc_alloc_from_dmchain() */ if (dmc->sab_dmchain) /* could be NULL if ALLOC_NOWAIT set */ centry = sdbc_alloc_from_dmchain(cd, cblk, alloc_tok, flag); return (centry); } /* * sdbc_alloc_from_dmchain -- allocate centry from a dmchain of centrys * * ARGUMENTS: * cd - Cache descriptor (from a previous open) * cblk - cache block number. * alloc_tok - pointer to token * flag - lock status of sdbc_queue_lock or ALLOC_NOWAIT * * RETURNS: * A cache block or possibly NULL if ALLOC_NOWAIT set. * * USAGE: * This routine allocates a new cache block from the supplied dmchain. * Assumes that dmchain is non-NULL and that all cache entries in * the dmchain have been removed from hash and have their cc_inuse and * cc_pageio bits set. */ static _sd_cctl_t * sdbc_alloc_from_dmchain(int cd, nsc_off_t cblk, sdbc_allocbuf_t *alloc_tok, int flag) { _sd_cctl_t *cc_ent, *old_ent; int categorize_centry; int locked = flag & ALLOC_LOCKED; int nowait = flag & ALLOC_NOWAIT; sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok; SDTRACE(ST_ENTER|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0); ASSERT(dmc->sab_dmchain); cc_ent = dmc->sab_dmchain; ASSERT(_sd_cctl_valid(cc_ent)); cc_ent->cc_valid = 0; categorize_centry = 0; if (cc_ent->cc_data) categorize_centry = FOUND_HOLD_OVER_DM; alloc_try: if (cd == _CD_NOHASH) CENTRY_BLK(cc_ent) = cblk; else if ((old_ent = (_sd_cctl_t *) _sd_hash_insert(cd, cblk, (struct _sd_hash_hd *)cc_ent, _sd_htable)) != cc_ent) { if (SET_CENTRY_INUSE(old_ent)) { sdbc_centry_inuse++; if (nowait) { cc_ent = NULL; goto out; } if (locked) rw_exit(&sdbc_queue_lock); _sd_cc_wait(cd, cblk, old_ent, CC_INUSE); if (locked) rw_enter(&sdbc_queue_lock, RW_WRITER); goto alloc_try; } /* * bug 4529671 * now that we own the centry make sure that * it is still good. it could have been processed * by _sd_dealloc_dm() in the window between * _sd_hash_insert() and SET_CENTRY_INUSE(). */ if ((_sd_cctl_t *)_sd_hash_search(cd, cblk, _sd_htable) != old_ent) { sdbc_centry_deallocd++; #ifdef DEBUG cmn_err(CE_WARN, "!cc_ent %p cd %d cblk %" NSC_SZFMT " lost to dealloc?! 
cc_data %p", (void *)old_ent, cd, cblk, (void *)old_ent->cc_data); #endif CLEAR_CENTRY_INUSE(old_ent); if (nowait) { cc_ent = NULL; goto out; } goto alloc_try; } if (CC_CD_BLK_MATCH(cd, cblk, old_ent)) { sdbc_centry_hit++; old_ent->cc_toflush = 0; /* _sd_centry_release(cc_ent); */ cc_ent = old_ent; categorize_centry = FOUND_IN_HASH_DM; } else { sdbc_centry_lost++; CLEAR_CENTRY_INUSE(old_ent); if (nowait) { cc_ent = NULL; goto out; } goto alloc_try; } } /* * advance the dmchain pointer, but only if we got the * cc_ent from the dmchain */ if (categorize_centry != FOUND_IN_HASH_DM) { if (cc_ent->cc_data) dmc->sab_dmchain = dmc->sab_dmchain->cc_next_dm; else dmc->sab_dmchain = dmc->sab_dmchain->cc_next; } SDTRACE(ST_EXIT|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0); mutex_enter(&cc_ent->cc_lock); if (cc_ent->cc_await_use) { cv_broadcast(&cc_ent->cc_blkcv); } mutex_exit(&cc_ent->cc_lock); sdbc_centry_init_dm(cc_ent); cc_ent->cc_aging_dm |= categorize_centry; out: SDTRACE(ST_INFO|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0); return (cc_ent); } /* * sdbc_centry_alloc_end -- tidy up after all cache blocks have been * allocated for a request * ARGUMENTS: * alloc_tok - pointer to allocation token * RETURNS * nothing * USAGE: * at this time only useful when sdbc_use_dmchain is true. * if there are cache blocks remaining on the chain then the inuse and * pageio bits must be cleared (they were set in sdbc_get_dmchain(). * */ static void sdbc_centry_alloc_end(sdbc_allocbuf_t *alloc_tok) { _sd_cctl_t *next_centry; _sd_cctl_t *prev_centry; _sd_queue_t *q; sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok; #ifdef DEBUG int chainpull = 0; #endif if (!sdbc_use_dmchain) return; next_centry = dmc->sab_dmchain; while (next_centry != NULL) { CLEAR_CENTRY_PAGEIO(next_centry); prev_centry = next_centry; if (next_centry->cc_data) { #ifdef DEBUG ++chainpull; #endif next_centry = next_centry->cc_next_dm; /* clear bit after final reference */ CLEAR_CENTRY_INUSE(prev_centry); } else { next_centry = next_centry->cc_next; /* * a floater from the 0 queue, insert on q. * * since this centry is not on any queue * the inuse bit can be cleared before * inserting on the q. this is also required * since sdbc_get_dmchain() does not expect * inuse bits to be set on 0 queue entry's. */ CLEAR_CENTRY_INUSE(prev_centry); q = &sdbc_dm_queues[0]; sdbc_ins_dmqueue_front(q, prev_centry); } } #ifdef DEBUG /* compute wastage stats */ ASSERT((chainpull >= 0) && (chainpull < max_dm_queues)); if (chainpull) (*(dmchainpull_table + (dmc->sab_q * max_dm_queues + chainpull)))++; #endif } /* * sdbc_alloc_lru - allocate a new cache block from the lru queue * * ARGUMENTS: * cd - Cache descriptor (from a previous open) * cblk - cache block number. * stall - pointer to stall count (no blocks avail) * flag - lock status of sdbc_queue_lock or ALLOC_NOWAIT * * RETURNS: * A cache block or NULL if ALLOC_NOWAIT specified * * USAGE: * This routine allocates a new cache block from the lru. * If an allocation cannot be done, we block, unless ALLOC_NOWAIT is set. 
*/ static _sd_cctl_t * sdbc_alloc_lru(int cd, nsc_off_t cblk, int *stall, int flag) { _sd_cctl_t *cc_ent, *old_ent, *ccnext; _sd_queue_t *q = _SD_LRU_Q; _sd_cctl_t *qhead = &(q->sq_qhead); int tries = 0, num_tries; int categorize_centry; int locked = flag & ALLOC_LOCKED; int nowait = flag & ALLOC_NOWAIT; if (nowait) { num_tries = q->sq_inq / 100; /* only search 1% of q */ if (num_tries <= 0) /* ensure num_tries is non-zero */ num_tries = q->sq_inq; } else num_tries = _sd_lruq_srch; SDTRACE(ST_ENTER|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0); retry_alloc_centry: for (cc_ent = (qhead->cc_next); cc_ent != qhead; cc_ent = ccnext) { if (--num_tries <= 0) if (nowait) { cc_ent = NULL; goto out; } else break; ccnext = cc_ent->cc_next; if (cc_ent->cc_aging_dm & BAD_CHAIN_DM) continue; if (CENTRY_DIRTY(cc_ent)) continue; if (SET_CENTRY_INUSE(cc_ent)) continue; if (CENTRY_DIRTY(cc_ent)) { sdbc_centry_lost++; CLEAR_CENTRY_INUSE(cc_ent); continue; } cc_ent->cc_flag = 0; /* CC_INUSE */ cc_ent->cc_toflush = 0; /* * Inlined requeue of the LRU. (should match _sd_requeue) */ /* was FAST */ mutex_enter(&q->sq_qlock); #if defined(_SD_DEBUG) if (1) { _sd_cctl_t *cp, *cn, *qp; cp = cc_ent->cc_prev; cn = cc_ent->cc_next; qp = (q->sq_qhead).cc_prev; if (!_sd_cctl_valid(cc_ent) || (cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) || (cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) || !_sd_cctl_valid(qp)) cmn_err(CE_PANIC, "_sd_centry_alloc %x prev %x next %x qp %x", cc_ent, cp, cn, qp); } #endif cc_ent->cc_prev->cc_next = cc_ent->cc_next; cc_ent->cc_next->cc_prev = cc_ent->cc_prev; cc_ent->cc_next = qhead; cc_ent->cc_prev = qhead->cc_prev; qhead->cc_prev->cc_next = cc_ent; qhead->cc_prev = cc_ent; cc_ent->cc_seq = q->sq_seq++; /* was FAST */ mutex_exit(&q->sq_qlock); /* * End inlined requeue. */ #if defined(_SD_STATS) if (_sd_hash_delete(cc_ent, _sd_htable) == 0) SDTRACE(SDF_REPLACE, CENTRY_CD(cc_ent), cc_ent->cc_hits, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), nsc_lbolt(), cc_ent->cc_creat); cc_ent->cc_creat = nsc_lbolt(); cc_ent->cc_hits = 0; #else #if defined(_SD_DEBUG) if (_sd_hash_delete(cc_ent, _sd_htable) == 0) { SDTRACE(SDF_REPLACE|ST_DL, CENTRY_CD(cc_ent), cc_ent->cc_valid, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), cd, BLK_TO_FBA_NUM(cblk)); if (cc_ent->cc_await_use || ((cd == CENTRY_CD(cc_ent)) && (cblk == CENTRY_BLK(cc_ent)))) DATA_LOG(SDF_REPLACE|ST_DL, cc_ent, 0, BLK_FBAS); } #else (void) _sd_hash_delete((struct _sd_hash_hd *)cc_ent, _sd_htable); #endif #endif cc_ent->cc_creat = nsc_lbolt(); cc_ent->cc_hits = 0; cc_ent->cc_valid = 0; categorize_centry = 0; if (cc_ent->cc_data) categorize_centry = FOUND_HOLD_OVER_DM; alloc_try: if (cd == _CD_NOHASH) CENTRY_BLK(cc_ent) = cblk; else if ((old_ent = (_sd_cctl_t *) _sd_hash_insert(cd, cblk, (struct _sd_hash_hd *)cc_ent, _sd_htable)) != cc_ent) { if (SET_CENTRY_INUSE(old_ent)) { sdbc_centry_inuse++; if (nowait) { _sd_centry_release(cc_ent); cc_ent = NULL; goto out; } if (locked) rw_exit(&sdbc_queue_lock); _sd_cc_wait(cd, cblk, old_ent, CC_INUSE); if (locked) rw_enter(&sdbc_queue_lock, RW_WRITER); goto alloc_try; } /* * bug 4529671 * now that we own the centry make sure that * it is still good. it could have been processed * by _sd_dealloc_dm() in the window between * _sd_hash_insert() and SET_CENTRY_INUSE(). */ if ((_sd_cctl_t *) _sd_hash_search(cd, cblk, _sd_htable) != old_ent) { sdbc_centry_deallocd++; #ifdef DEBUG cmn_err(CE_WARN, "!cc_ent %p cd %d cblk %" NSC_SZFMT " lost to dealloc?! 
cc_data %p", (void *)old_ent, cd, cblk, (void *)old_ent->cc_data); #endif CLEAR_CENTRY_INUSE(old_ent); if (nowait) { _sd_centry_release(cc_ent); cc_ent = NULL; goto out; } goto alloc_try; } if (CC_CD_BLK_MATCH(cd, cblk, old_ent)) { sdbc_centry_hit++; old_ent->cc_toflush = 0; _sd_centry_release(cc_ent); cc_ent = old_ent; categorize_centry = FOUND_IN_HASH_DM; } else { sdbc_centry_lost++; CLEAR_CENTRY_INUSE(old_ent); if (nowait) { _sd_centry_release(cc_ent); cc_ent = NULL; goto out; } goto alloc_try; } } SDTRACE(ST_EXIT|SDF_ENT_ALLOC, cd, tries, BLK_TO_FBA_NUM(cblk), 0, 0); if (cc_ent->cc_await_use) { mutex_enter(&cc_ent->cc_lock); cv_broadcast(&cc_ent->cc_blkcv); mutex_exit(&cc_ent->cc_lock); } sdbc_centry_init_dm(cc_ent); cc_ent->cc_aging_dm |= categorize_centry; out: return (cc_ent); } SDTRACE(ST_INFO|SDF_ENT_ALLOC, cd, ++tries, BLK_TO_FBA_NUM(cblk), 0, 0); delay(drv_usectohz(20000)); (void) (*stall)++; num_tries = _sd_lruq_srch; goto retry_alloc_centry; } /* * sdbc_centry_init_dm - setup the cache block for dynamic memory allocation * * ARGUMENTS: * centry - Cache block. * * RETURNS: * NONE * * USAGE: * This routine is the central point in which cache entry blocks are setup */ static void sdbc_centry_init_dm(_sd_cctl_t *centry) { /* an entry already setup - don't touch simply refresh age */ if (centry->cc_data) { centry->cc_aging_dm &= ~(FINAL_AGING_DM); DTRACE_PROBE1(sdbc_centry_init_dm_end, char *, centry->cc_data); return; } centry->cc_aging_dm &= ~(FINAL_AGING_DM | CATAGORY_ENTRY_DM); if (centry->cc_head_dm || centry->cc_next_dm) cmn_err(cmn_level, "!sdbc(sdbc_centry_init_dm): " "non-zero mem chain in ccent %p", (void *)centry); centry->cc_head_dm = 0; if (!sdbc_use_dmchain) centry->cc_next_dm = 0; centry->cc_data = 0; } /* * sdbc_centry_memalloc_dm * * Actually allocate the cache memory, storing it in the cc_data field for * the cctl * * ARGS: * centry: cache control block for which to allocate the memory * alloc_request: number of bytes to allocate * flag: if called with ALLOC_NOWAIT, caller must check for non-zero return * * RETURNS: * 0 on success * non-zero on error */ static int sdbc_centry_memalloc_dm(_sd_cctl_t *centry, int alloc_request, int flag) { int cblocks; _sd_queue_t *newq; int sleep; sleep = (flag & ALLOC_NOWAIT) ? KM_NOSLEEP : KM_SLEEP; if (!centry->cc_data && (alloc_request > 0)) { /* host or other */ dynmem_processing_dm.alloc_ct++; centry->cc_data = (unsigned char *) kmem_alloc((size_t)centry->cc_alloc_size_dm, sleep); if (sdbc_use_dmchain) { cblocks = centry->cc_alloc_size_dm >> _sd_cblock_shift; newq = &sdbc_dm_queues[cblocks]; /* set the dmqueue index */ centry->cc_cblocks = cblocks; /* put on appropriate queue */ sdbc_ins_dmqueue_back(newq, centry); } /* * for KM_NOSLEEP (should never happen with KM_SLEEP) */ if (!centry->cc_data) return (LOW_RESOURCES_DM); centry->cc_head_dm = centry; centry->cc_alloc_ct_dm++; } return (0); } /* * _sd_centry_release - release a cache block * * ARGUMENTS: * centry - Cache block. * * RETURNS: * NONE * * USAGE: * This routine frees up a cache block. It also frees up a write * block if allocated and its valid to release it. 
*/ void _sd_centry_release(_sd_cctl_t *centry) { ss_centry_info_t *wctl; SDTRACE(ST_ENTER|SDF_ENT_FREE, CENTRY_CD(centry), 0, BLK_TO_FBA_NUM(CENTRY_BLK(centry)), 0, 0); CLEAR_CENTRY_PAGEIO(centry); if ((wctl = centry->cc_write) != 0) { /* was FAST */ mutex_enter(&centry->cc_lock); if (CENTRY_DIRTY(centry)) wctl = NULL; else { centry->cc_write = NULL; centry->cc_flag &= ~(CC_PINNABLE); } /* was FAST */ mutex_exit(&centry->cc_lock); if (wctl) { wctl->sc_dirty = 0; SSOP_SETCENTRY(sdbc_safestore, wctl); SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res); } } if (!(centry->cc_aging_dm & BAD_CHAIN_DM)) { if (sdbc_use_dmchain) { if (centry->cc_alloc_size_dm) { /* see if this can be queued to head */ if (CENTRY_QHEAD(centry)) { sdbc_requeue_head_dm_try(centry); } else { int qidx; _sd_queue_t *q; qidx = centry->cc_cblocks; q = &sdbc_dm_queues[qidx]; if (_sd_lru_reinsert(q, centry)) { sdbc_requeue_dmchain(q, centry, 1, 1); } } } else { /* * Fix for bug 4949134: * If an internal block is marked with CC_QHEAD * but the HOST block is not, the chain will * never age properly, and will never be made * available. Only the HOST of the dmchain is * checked for CC_QHEAD, so clearing an internal * block indiscriminately (as is being done * here) does no damage. * * The same result could instead be achieved by * not setting the CC_QHEAD flag in the first * place, if the block is an internal dmchain * block, and if it is found in the hash table. * The current solution was chosen since it is * the least intrusive. */ centry->cc_flag &= ~CC_QHEAD; } } else { if (CENTRY_QHEAD(centry)) { if (!CENTRY_DIRTY(centry)) _sd_requeue_head(centry); } else if (_sd_lru_reinsert(_SD_LRU_Q, centry)) _sd_requeue(centry); } } SDTRACE(ST_EXIT|SDF_ENT_FREE, CENTRY_CD(centry), 0, BLK_TO_FBA_NUM(CENTRY_BLK(centry)), 0, 0); /* only clear inuse after final reference to centry */ CLEAR_CENTRY_INUSE(centry); } /* * look up the centry info associated with a safestore resource * return a pointer to the centry info structure */ ss_centry_info_t * sdbc_get_cinfo_byres(ss_resource_t *res) { ss_centry_info_t *cinfo; ss_centry_info_t *cend; int found = 0; ASSERT(res != NULL); if (res == NULL) return (NULL); cinfo = _sdbc_gl_centry_info; cend = _sdbc_gl_centry_info + (_sdbc_gl_centry_info_size / sizeof (ss_centry_info_t)) - 1; for (; cinfo <= cend; ++cinfo) if (cinfo->sc_res == res) { ++found; break; } if (!found) cinfo = NULL; /* bad */ return (cinfo); } /* * _sd_alloc_write - Allocate a write block (for remote mirroring) * and set centry->cc_write * * ARGUMENTS: * centry - Head of Cache chain * stall - pointer to stall count (no blocks avail) * * RETURNS: * 0 - and sets cc_write for all entries when a write control block is obtained. * -1 - if a write control block could not be obtained. */ int _sd_alloc_write(_sd_cctl_t *centry, int *stall) { ss_resourcelist_t *reslist; ss_resourcelist_t *savereslist; ss_resource_t *res; _sd_cctl_t *ce; int err; int need; need = 0; for (ce = centry; ce; ce = ce->cc_chain) { if (!(ce->cc_write)) need++; } if (!need) return (0); if ((SSOP_ALLOCRESOURCE(sdbc_safestore, need, stall, &reslist)) == SS_OK) { savereslist = reslist; for (ce = centry; ce; ce = ce->cc_chain) { if (ce->cc_write) continue; err = SSOP_GETRESOURCE(sdbc_safestore, &reslist, &res); if (err == SS_OK) ce->cc_write = sdbc_get_cinfo_byres(res); ASSERT(err == SS_OK); /* panic if DEBUG on */ ASSERT(ce->cc_write != NULL); /* * this is bad and should not happen. * we use the saved reslist to cleanup * and return.
*/ if ((err != SS_OK) || !ce->cc_write) { cmn_err(CE_WARN, "!_sd_alloc_write: " "bad resource list 0x%p" "changing to forced write thru mode", (void *)savereslist); (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); while (SSOP_GETRESOURCE(sdbc_safestore, &savereslist, &res) == SS_OK) { SSOP_DEALLOCRESOURCE(sdbc_safestore, res); } return (-1); } } return (0); } /* no safestore resources available. do sync write */ _sd_unblock(&_sd_flush_cv); return (-1); } /* * _sd_read - Interface call to do read. * * ARGUMENTS: * handle - handle allocated earlier on. * fba_pos - disk block number to read from. * fba_len - length in fbas. * flag - flag: (NSC_NOBLOCK for async io) * * RETURNS: * errno if return > 0 * NSC_DONE or NSC_PENDING otherwise. * * USAGE: * This routine checks if the request is valid and calls the underlying * doread routine (also called by alloc_buf) */ int _sd_read(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len, int flag) { sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ _sd_cctl_t *cc_ent = NULL; nsc_size_t fba_orig_len = fba_len; int ret; int cd = HANDLE_CD(handle); if (_sdbc_shutdown_in_progress || (handle->bh_flag & NSC_ABUF)) { ret = EIO; goto out; } #if !defined(_SD_NOCHECKS) if (!_SD_HANDLE_ACTIVE(handle)) { cmn_err(CE_WARN, "!sdbc(_sd_read) handle %p not active", (void *)handle); ret = EINVAL; goto out; } ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len); #endif if (fba_len == 0) { ret = NSC_DONE; goto out; } KSTAT_RUNQ_ENTER(cd); st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } cc_ent = handle->bh_centry; while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos)) cc_ent = cc_ent->cc_chain; if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, cc_ent)) goto need_io; DATA_LOG(SDF_RD, cc_ent, st_cblk_off, st_cblk_len); DTRACE_PROBE4(_sd_read_data1, uint64_t, (uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + st_cblk_off), uint64_t, (uint64_t)st_cblk_len, char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off)), char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off + st_cblk_len) - 8)); fba_pos += st_cblk_len; fba_len -= st_cblk_len; cc_ent = cc_ent->cc_chain; while (fba_len > (nsc_size_t)end_cblk_len) { if (!FULLY_VALID(cc_ent)) goto need_io; DATA_LOG(SDF_RD, cc_ent, 0, BLK_FBAS); DTRACE_PROBE4(_sd_read_data2, uint64_t, (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), uint64_t, (uint64_t)BLK_FBAS, char *, *(int64_t *)(cc_ent->cc_data), char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(BLK_FBAS) - 8)); fba_pos += BLK_FBAS; fba_len -= BLK_FBAS; cc_ent = cc_ent->cc_chain; } if (fba_len) { if (!SDBC_VALID_BITS(0, end_cblk_len, cc_ent)) goto need_io; DATA_LOG(SDF_RD, cc_ent, 0, end_cblk_len); DTRACE_PROBE4(_sd_read_data3, uint64_t, (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), uint64_t, (uint64_t)end_cblk_len, char *, *(int64_t *)(cc_ent->cc_data), char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(end_cblk_len) - 8)); } CACHE_FBA_READ(handle->bh_cd, fba_orig_len); CACHE_READ_HIT; FBA_READ_IO_KSTATS(handle->bh_cd, FBA_SIZE(fba_orig_len)); ret = NSC_HIT; goto stats_exit; need_io: _SD_DISCONNECT_CALLBACK(handle); ret = _sd_doread(handle, cc_ent, fba_pos, fba_len, flag); stats_exit: KSTAT_RUNQ_EXIT(cd); out: return (ret); } /* * sdbc_doread_prefetch - read 
ahead one cache block * * ARGUMENTS: * cc_ent - cache entry * fba_pos - disk block number to read from * fba_len - length in fbas. * * RETURNS: * number of fbas, if any, that are to be read beyond (fba_pos + fba_len) * * USAGE: * if readahead is to be done allocate a cache block and place * on the cc_chain of cc_ent */ static int sdbc_doread_prefetch(_sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len) { nsc_off_t st_cblk = FBA_TO_BLK_NUM(fba_pos); nsc_off_t next_cblk = FBA_TO_BLK_NUM(fba_pos + BLK_FBAS); nsc_size_t filesize; int fba_count = 0; /* number of fbas to prefetch */ _sd_cctl_t *cc_ra; /* the read ahead cache entry */ int cd = CENTRY_CD(cc_ent); nsc_size_t vol_fill; filesize = _sd_cache_files[cd].cd_info->sh_filesize; vol_fill = filesize - (fba_pos + fba_len); /* readahead only for small reads */ if ((fba_len <= FBA_LEN(CACHE_BLOCK_SIZE)) && (fba_pos != 0) && (vol_fill > 0)) { /* * if prev block is in cache and next block is not, * then read ahead one block */ if (_sd_hash_search(cd, st_cblk - 1, _sd_htable)) { if (!_sd_hash_search(cd, next_cblk, _sd_htable)) { cc_ra = sdbc_centry_alloc_blks (cd, next_cblk, 1, ALLOC_NOWAIT); if (cc_ra) { /* if in cache don't readahead */ if (cc_ra->cc_aging_dm & HASH_ENTRY_DM) { ++sdbc_ra_hash; _sd_centry_release(cc_ra); } else { cc_ent->cc_chain = cc_ra; cc_ra->cc_chain = 0; fba_count = (vol_fill > (nsc_size_t)BLK_FBAS) ? BLK_FBAS : (int)vol_fill; /* * indicate implicit prefetch * and mark for release in * _sd_read_complete() */ cc_ra->cc_aging_dm |= (PREFETCH_BUF_I | PREFETCH_BUF_IR); } } else { ++sdbc_ra_none; } } } } return (fba_count); } /* * _sd_doread - Check if blocks in cache. If not completely true, do io. * * ARGUMENTS: * handle - handle allocated earlier on. * fba_pos - disk block number to read from. * fba_len - length in fbas. * flag - flag: (NSC_NOBLOCK for async io) * * RETURNS: * errno if return > 0 * NSC_DONE(from disk), or NSC_PENDING otherwise. * * Comments: * It initiates an io and either blocks waiting for the completion * or return NSC_PENDING, depending on whether the flag bit * NSC_NOBLOCK is reset or set. 
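 *
 * (Informal summary of the code below: the request is first rounded
 * out to whole cache blocks and clipped at the end of the volume,
 * optionally extended by one block of readahead when sdbc_prefetch1
 * is set, and then issued as a single buf built up with
 * sd_alloc_iob() and sd_add_fba().)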
* */ static int _sd_doread(_sd_buf_handle_t *handle, _sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len, int flag) { int cd, err; nsc_size_t fba_orig_len; /* length in FBA's of the original request */ nsc_size_t file_len; /* length in bytes of io to be done */ sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ int num_bdl; _sd_cctl_t *cc_temp; struct buf *bp; unsigned int want_bits; void (*fn)(blind_t, nsc_off_t, nsc_size_t, int); sdbc_cblk_fba_t end_cblk_fill; /* FBA's to fill to end of last block */ nsc_size_t vol_end_fill; /* # of FBA's to fill to end of the volume */ cd = HANDLE_CD(handle); SDTRACE(ST_ENTER|SDF_READ, cd, fba_len, fba_pos, flag, 0); ASSERT(cd >= 0); if (_sd_cache_files[cd].cd_info->sh_failed) { SDTRACE(ST_EXIT|SDF_READ, cd, fba_len, fba_pos, flag, EIO); return (EIO); } /* * adjust the position and length so that the entire cache * block is read in */ /* first, adjust to beginning of cache block */ fba_len += BLK_FBA_OFF(fba_pos); /* add start offset to length */ fba_pos &= ~BLK_FBA_MASK; /* move position back to start of block */ /* compute fill to end of cache block */ end_cblk_fill = (BLK_FBAS - 1) - ((fba_len - 1) % BLK_FBAS); vol_end_fill = _sd_cache_files[(cd)].cd_info->sh_filesize - (fba_pos + fba_len); /* fill to lesser of cache block or end of volume */ fba_len += ((nsc_size_t)end_cblk_fill < vol_end_fill) ? end_cblk_fill : vol_end_fill; DTRACE_PROBE2(_sd_doread_rfill, nsc_off_t, fba_pos, nsc_size_t, fba_len); /* for small reads do 1-block readahead if previous block is in cache */ if (sdbc_prefetch1) fba_len += sdbc_doread_prefetch(cc_ent, fba_pos, fba_len); fba_orig_len = fba_len; st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } cc_temp = cc_ent; num_bdl = 0; while (cc_temp) { num_bdl += (SDBC_LOOKUP_IOCOUNT(CENTRY_DIRTY(cc_temp))); cc_temp = cc_temp->cc_chain; } bp = sd_alloc_iob(_sd_cache_files[cd].cd_crdev, fba_pos, num_bdl, B_READ); if (bp == NULL) { SDTRACE(ST_EXIT|SDF_READ, cd, fba_len, fba_pos, flag, E2BIG); return (E2BIG); } want_bits = SDBC_GET_BITS(st_cblk_off, st_cblk_len); if (want_bits & CENTRY_DIRTY(cc_ent)) _sd_ccent_rd(cc_ent, want_bits, bp); else { sd_add_fba(bp, &cc_ent->cc_addr, st_cblk_off, st_cblk_len); } file_len = FBA_SIZE(st_cblk_len); cc_ent = cc_ent->cc_chain; fba_len -= st_cblk_len; while (fba_len > (nsc_size_t)end_cblk_len) { if (CENTRY_DIRTY(cc_ent)) _sd_ccent_rd(cc_ent, (uint_t)BLK_FBA_BITS, bp); else { sd_add_fba(bp, &cc_ent->cc_addr, 0, BLK_FBAS); } file_len += CACHE_BLOCK_SIZE; cc_ent = cc_ent->cc_chain; fba_len -= BLK_FBAS; } if (fba_len) { want_bits = SDBC_GET_BITS(0, end_cblk_len); if (want_bits & CENTRY_DIRTY(cc_ent)) _sd_ccent_rd(cc_ent, want_bits, bp); else { sd_add_fba(bp, &cc_ent->cc_addr, 0, end_cblk_len); } file_len += FBA_SIZE(end_cblk_len); } CACHE_READ_MISS; FBA_READ_IO_KSTATS(cd, file_len); DISK_FBA_READ(cd, FBA_NUM(file_len)); fn = (handle->bh_flag & NSC_NOBLOCK) ? 
_sd_async_read_ea : NULL; err = sd_start_io(bp, _sd_cache_files[cd].cd_strategy, fn, handle); if (err != NSC_PENDING) { _sd_read_complete(handle, fba_pos, fba_orig_len, err); } SDTRACE(ST_EXIT|SDF_READ, cd, fba_orig_len, fba_pos, flag, err); return (err); } /* * _sd_read_complete - Do whatever is necessary after a read io is done. * * ARGUMENTS: * handle - handle allocated earlier on. * fba_pos - disk block number to read from. * fba_len - length in fbas. * error - error from io if any. * * RETURNS: * NONE. * * Comments: * This routine marks the cache blocks valid if the io completed * sucessfully. Called from the async end action as well as after * a synchrnous read completes. */ void _sd_read_complete(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len, int error) { sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ nsc_size_t cur_fba_len; /* length in FBA's */ _sd_cctl_t *cc_iocent; _sd_cctl_t *first_iocent; /* first buffer when processing prefetch */ cc_iocent = handle->bh_centry; if ((handle->bh_error = error) == 0) { while (CENTRY_BLK(cc_iocent) != FBA_TO_BLK_NUM(fba_pos)) cc_iocent = cc_iocent->cc_chain; cur_fba_len = fba_len; st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } SDBC_SET_VALID_BITS(st_cblk_off, st_cblk_len, cc_iocent); DATA_LOG(SDF_RDIO, cc_iocent, st_cblk_off, st_cblk_len); DTRACE_PROBE4(_sd_read_complete_data1, uint64_t, (uint64_t) BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)) + st_cblk_off, int, st_cblk_len, char *, *(int64_t *)(cc_iocent->cc_data + FBA_SIZE(st_cblk_off)), char *, *(int64_t *)(cc_iocent->cc_data + FBA_SIZE(st_cblk_off + st_cblk_len) - 8)); first_iocent = cc_iocent; cc_iocent = cc_iocent->cc_chain; cur_fba_len -= st_cblk_len; while (cur_fba_len > (nsc_size_t)end_cblk_len) { SET_FULLY_VALID(cc_iocent); DATA_LOG(SDF_RDIO, cc_iocent, 0, BLK_FBAS); DTRACE_PROBE4(_sd_read_complete_data2, uint64_t, (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)), int, BLK_FBAS, char *, *(int64_t *)(cc_iocent->cc_data), char *, *(int64_t *)(cc_iocent->cc_data + FBA_SIZE(BLK_FBAS) - 8)); /* * 4755485 release implicit prefetch buffers * * the cc_chain of the first buffer must NULL'd * else _sd_free_buf() will do a double free when * it traverses the chain. * * if a buffer has been marked PREFETCH_BUF_IR then * it is guaranteed that * 1. it is the second in a chain of two. * 2. cur_fba_len is BLK_FBAS. * 3. end_cblk_len is zero. * * because of 1 (and 2) above, we can safely exit the * while loop via the break statement without * executing the last two statements. the break * statement is necessary because it would be unsafe * to access cc_iocent which could be reallocated * immediately after the _sd_centry_release(). 
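 *
 * for illustration, the only chain shape that can carry
 * PREFETCH_BUF_IR is the two block chain built by
 * sdbc_doread_prefetch() for a small read plus its readahead
 * block:
 *
 *	first_iocent (requested block) --cc_chain--> cc_iocent
 *				(readahead block, PREFETCH_BUF_I
 *				 and PREFETCH_BUF_IR set)
 *
 * after the release below only the requested block stays attached
 * to the handle; the readahead block remains in the cache where a
 * later read can hit it.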
*/ if (cc_iocent->cc_aging_dm & PREFETCH_BUF_IR) { cc_iocent->cc_aging_dm &= ~(PREFETCH_BUF_IR); _sd_centry_release(cc_iocent); first_iocent->cc_chain = NULL; break; } cc_iocent = cc_iocent->cc_chain; cur_fba_len -= BLK_FBAS; } if (end_cblk_len) { SDBC_SET_VALID_BITS(0, end_cblk_len, cc_iocent); DATA_LOG(SDF_RDIO, cc_iocent, 0, end_cblk_len); DTRACE_PROBE4(_sd_read_complete_data3, uint64_t, (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)), int, end_cblk_len, char *, *(int64_t *)(cc_iocent->cc_data), char *, *(int64_t *)(cc_iocent->cc_data + FBA_SIZE(end_cblk_len) - 8)); } } } /* * _sd_async_read_ea - End action for async reads. * * ARGUMENTS: * xhandle - handle allocated earlier on (cast to blind_t). * fba_pos - disk block number read from. * fba_len - length in fbas. * error - error from io if any. * * RETURNS: * NONE. * * Comments: * This routine is called at interrupt level when the io is done. * This is called only when read is asynchronous (NSC_NOBLOCK) */ static void _sd_async_read_ea(blind_t xhandle, nsc_off_t fba_pos, nsc_size_t fba_len, int error) { _sd_buf_handle_t *handle = xhandle; int cd; if (error) { cd = HANDLE_CD(handle); ASSERT(cd >= 0); _sd_cache_files[cd].cd_info->sh_failed = 1; } SDTRACE(ST_ENTER|SDF_READ_EA, HANDLE_CD(handle), handle->bh_fba_len, handle->bh_fba_pos, 0, error); _sd_read_complete(handle, fba_pos, fba_len, error); #if defined(_SD_DEBUG_PATTERN) check_buf_consistency(handle, "rd"); #endif SDTRACE(ST_EXIT|SDF_READ_EA, HANDLE_CD(handle), handle->bh_fba_len, handle->bh_fba_pos, 0, 0); _SD_READ_CALLBACK(handle); } /* * _sd_async_write_ea - End action for async writes. * * ARGUMENTS: * xhandle - handle allocated earlier on. (cast to blind_t) * fba_pos - disk block number written to. * fba_len - length in fbas. * error - error from io if any. * * RETURNS: * NONE. * * Comments: * This routine is called at interrupt level when the write io is done. * This is called only when we are in write-through mode and the write * call indicated asynchronous callback. (NSC_NOBLOCK) */ /* ARGSUSED */ static void _sd_async_write_ea(blind_t xhandle, nsc_off_t fba_pos, nsc_size_t fba_len, int error) { _sd_buf_handle_t *handle = xhandle; handle->bh_error = error; if (error) _sd_cache_files[HANDLE_CD(handle)].cd_info->sh_failed = 1; _SD_WRITE_CALLBACK(handle); } /* * update_dirty - set dirty bits in cache block which is already dirty * cc_inuse is held, need cc_lock to avoid race with _sd_process_pending * must check for I/O in-progress and set PEND_DIRTY. * return previous dirty bits * [if set _sd_process_pending will re-issue] */ static _sd_bitmap_t update_dirty(_sd_cctl_t *cc_ent, sdbc_cblk_fba_t st_off, sdbc_cblk_fba_t st_len) { _sd_bitmap_t old; /* was FAST */ mutex_enter(&cc_ent->cc_lock); old = CENTRY_DIRTY(cc_ent); if (old) { /* * If we are writing to an FBA that is still marked dirty, * record a write cancellation. */ if (old & SDBC_GET_BITS(st_off, st_len)) { CACHE_WRITE_CANCELLATION(CENTRY_CD(cc_ent)); } /* This is a write to a block that was already dirty */ SDBC_SET_DIRTY(st_off, st_len, cc_ent); sd_serialize(); if (CENTRY_IO_INPROGRESS(cc_ent)) cc_ent->cc_flag |= CC_PEND_DIRTY; } /* was FAST */ mutex_exit(&cc_ent->cc_lock); return (old); } /* * _sd_write - Interface call to commit part of handle. * * ARGUMENTS: * handle - handle allocated earlier o. * fba_pos - disk block number to write to. * fba_len - length in fbas. * flag - (NSC_NOBLOCK | NSC_WRTHRU) * * RETURNS: * errno if return > 0 * NSC_HIT (in cache), NSC_DONE (to disk) or NSC_PENDING otherwise. 
* * Comments: * This routine checks validity of the handle and then calls the * sync-write function if this write is determined to be write-through. * Else, it reflects the data to the write blocks on the mirror node, * (allocated in alloc_buf). If the cache block is not dirty, it is * marked dirty and queued up for io processing later on. * If parts are already dirty but io is not in progress yet, it is * marked dirty and left alone (it is already in the queue) * If parts are already dirty but io is in progress, it is marked * dirty and also a flag is set indicating that this buffer should * be reprocessed after the io-end-action. * Attempt is made to coalesce multiple writes into a single list * for io processing later on. * * Issuing of writes may be delayed until the handle is released; * _sd_queue_write() sets NSC_QUEUE, indicating that dirty bits * and reflection to mirror have already been done, just queue I/O. */ int _sd_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len, int flag) { int cd = HANDLE_CD(handle); int num_queued, ret, queue_only, store_only; sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ nsc_size_t cur_fba_len; /* position in disk blocks */ _sd_cctl_t *cc_ent = NULL; _sd_cctl_t *cur_chain = NULL, *dirty_next = NULL; if (_sdbc_shutdown_in_progress) { ret = EIO; goto out; } if (!_SD_HANDLE_ACTIVE(handle)) { SDALERT(SDF_WRITE, SDT_INV_CD, 0, SDT_INV_BL, handle->bh_flag, 0); ret = EINVAL; goto out; } #if !defined(_SD_NOCHECKS) ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len); if ((handle->bh_flag & NSC_WRBUF) == 0) { ret = EINVAL; goto out; } #endif if (fba_len == 0) { ret = NSC_DONE; goto out; } /* * store_only: don't queue this I/O yet * queue_only: queue I/O to disk, don't store in mirror node */ if (flag & NSC_QUEUE) queue_only = 1, store_only = 0; else if (_SD_DELAY_QUEUE && (fba_len != handle->bh_fba_len)) queue_only = 0, store_only = 1; else queue_only = store_only = 0; if (!queue_only && _SD_FORCE_DISCONNECT(fba_len)) _SD_DISCONNECT_CALLBACK(handle); if (_sd_cache_files[cd].cd_info->sh_failed) { ret = EIO; goto out; } KSTAT_RUNQ_ENTER(cd); SDTRACE(ST_ENTER|SDF_WRITE, cd, fba_len, fba_pos, flag, 0); #if defined(_SD_DEBUG_PATTERN) check_buf_consistency(handle, "wr"); #endif cc_ent = handle->bh_centry; while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos)) cc_ent = cc_ent->cc_chain; if (((handle->bh_flag | flag) & _SD_WRTHRU_MASK) || (!queue_only && _sd_remote_store(cc_ent, fba_pos, fba_len))) { flag |= NSC_WRTHRU; ret = _sd_sync_write(handle, fba_pos, fba_len, flag); goto stats_exit; } if (store_only) /* enqueue in _sd_free_buf() */ handle->bh_flag |= NSC_QUEUE; cur_fba_len = fba_len; st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, st_cblk_off, st_cblk_len)) goto loop1; if (store_only) { SDBC_SET_TOFLUSH(st_cblk_off, st_cblk_len, cc_ent); goto loop1; } SDBC_SET_DIRTY(st_cblk_off, st_cblk_len, cc_ent); cur_chain = dirty_next = cc_ent; num_queued = 1; loop1: DATA_LOG(SDF_WR, cc_ent, st_cblk_off, st_cblk_len); DTRACE_PROBE4(_sd_write_data1, uint64_t, (uint64_t) (BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + st_cblk_off), int, st_cblk_len, char *, *(int64_t *)(cc_ent->cc_data + 
FBA_SIZE(st_cblk_off)), char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off+ st_cblk_len) - 8)); cur_fba_len -= st_cblk_len; cc_ent = cc_ent->cc_chain; while (cur_fba_len > (nsc_size_t)end_cblk_len) { if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, 0, BLK_FBAS)) { if (cur_chain) { _sd_enqueue_dirty(cd, cur_chain, dirty_next, num_queued); cur_chain = dirty_next = NULL; } goto loop2; } if (store_only) { SDBC_SET_TOFLUSH(0, BLK_FBAS, cc_ent); goto loop2; } SDBC_SET_DIRTY(0, BLK_FBAS, cc_ent); if (dirty_next) { dirty_next->cc_dirty_next = cc_ent; dirty_next = cc_ent; num_queued++; } else { cur_chain = dirty_next = cc_ent; num_queued = 1; } loop2: DATA_LOG(SDF_WR, cc_ent, 0, BLK_FBAS); DTRACE_PROBE4(_sd_write_data2, uint64_t, (uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent))), int, BLK_FBAS, char *, *(int64_t *)(cc_ent->cc_data), char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(BLK_FBAS) - 8)); cc_ent = cc_ent->cc_chain; cur_fba_len -= BLK_FBAS; } #if defined(_SD_DEBUG) if (cur_fba_len != end_cblk_len) cmn_err(CE_WARN, "!fba_len %" NSC_SZFMT " end_cblk_len %d in " "_sd_write", cur_fba_len, end_cblk_len); #endif if (cur_fba_len) { if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, 0, end_cblk_len)) { if (cur_chain) { _sd_enqueue_dirty(cd, cur_chain, dirty_next, num_queued); cur_chain = dirty_next = NULL; } goto loop3; } if (store_only) { SDBC_SET_TOFLUSH(0, end_cblk_len, cc_ent); goto loop3; } SDBC_SET_DIRTY(0, end_cblk_len, cc_ent); if (dirty_next) { dirty_next->cc_dirty_next = cc_ent; dirty_next = cc_ent; num_queued++; } else { cur_chain = dirty_next = cc_ent; num_queued = 1; } } loop3: if (cur_fba_len) { DATA_LOG(SDF_WR, cc_ent, 0, end_cblk_len); DTRACE_PROBE4(_sd_write_data3, uint64_t, (uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent))), int, end_cblk_len, char *, *(int64_t *)(cc_ent->cc_data), char *, *(int64_t *)(cc_ent->cc_data + FBA_SIZE(end_cblk_len) - 8)); } if (!store_only && cur_chain) { _sd_enqueue_dirty(cd, cur_chain, dirty_next, num_queued); } if (!queue_only) { CACHE_FBA_WRITE(cd, fba_len); CACHE_WRITE_HIT; FBA_WRITE_IO_KSTATS(cd, FBA_SIZE(fba_len)); } ret = NSC_HIT; stats_exit: SDTRACE(ST_EXIT|SDF_WRITE, cd, fba_len, fba_pos, flag, ret); KSTAT_RUNQ_EXIT(cd); out: return (ret); } /* * _sd_queue_write(handle, fba_pos, fba_len): Queues delayed writes for * flushing * * ARGUMENTS: handle - handle allocated with NSC_WRBUF * fba_pos - starting fba pos from _sd_alloc_buf() * fba_len - fba len from _sd_alloc_buf() * * USAGE : Called if _SD_DELAY_QUEUE is set. Finds all blocks in the * handle marked for flushing and queues them to be written in * optimized (i.e. 
sequential) order */ static void _sd_queue_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len) { nsc_off_t fba_end; sdbc_cblk_fba_t sblk, len, dirty; _sd_cctl_t *cc_ent; nsc_off_t flush_pos; int flush_pos_valid = 0; nsc_size_t flush_len = 0; cc_ent = handle->bh_centry; fba_end = fba_pos + fba_len; fba_pos = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)); /* 1st block */ while (fba_pos < fba_end) { dirty = cc_ent->cc_toflush; cc_ent->cc_toflush = 0; /* * Full block */ if (_SD_BMAP_ISFULL(dirty)) { if (flush_pos_valid == 0) { flush_pos_valid = 1; flush_pos = fba_pos; } flush_len += BLK_FBAS; } /* * Partial block */ else while (dirty) { sblk = SDBC_LOOKUP_STPOS(dirty); len = SDBC_LOOKUP_LEN(dirty); SDBC_LOOKUP_MODIFY(dirty); if (sblk && flush_pos_valid) { (void) _sd_write(handle, flush_pos, flush_len, NSC_QUEUE); flush_pos_valid = 0; flush_len = 0; } if (flush_pos_valid == 0) { flush_pos_valid = 1; flush_pos = fba_pos + sblk; } flush_len += len; } fba_pos += BLK_FBAS; cc_ent = cc_ent->cc_chain; /* * If we find a gap, write out what we've got */ if (flush_pos_valid && (flush_pos + flush_len) != fba_pos) { (void) _sd_write(handle, flush_pos, flush_len, NSC_QUEUE); flush_pos_valid = 0; flush_len = 0; } } if (flush_pos_valid) (void) _sd_write(handle, flush_pos, flush_len, NSC_QUEUE); } static int _sd_remote_store(_sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len) { sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ ss_resource_t *ss_res; if (_sd_nodes_configured <= 2 && _sd_is_mirror_down()) return (0); st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } fba_len -= st_cblk_len; ss_res = cc_ent->cc_write->sc_res; if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res, cc_ent->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len), FBA_SIZE(st_cblk_off))) { cmn_err(CE_WARN, "!sdbc(_sd_write) safe store failed. Going synchronous"); SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len, fba_pos, 0, -1); return (-1); } cc_ent = cc_ent->cc_chain; while (fba_len > (nsc_size_t)end_cblk_len) { fba_len -= BLK_FBAS; if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res, cc_ent->cc_data, CACHE_BLOCK_SIZE, 0)) { cmn_err(CE_WARN, "!sdbc(_sd_write) safe store failed. " "Going synchronous"); SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len, fba_pos, 0, -1); return (-1); } cc_ent = cc_ent->cc_chain; } /* end while */ if (fba_len) { if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res, cc_ent->cc_data, FBA_SIZE(end_cblk_len), 0)) { cmn_err(CE_WARN, "!sdbc(_sd_write) nvmem dma failed. " "Going synchronous"); SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len, fba_pos, 0, -1); return (-1); } } return (0); } /* * _sd_sync_write2 - Write-through function. * * ARGUMENTS: * wr_handle - handle into which to write the data. * wr_st_pos - starting FBA position in wr_handle. * fba_len - length in fbas. * flag - NSC_NOBLOCK for async io. * rd_handle - handle from which to read the data, or NULL. * rd_st_pos - starting FBA position in rd_handle. * * RETURNS: * errno if return > 0 * NSC_DONE or NSC_PENDING otherwise. * * Comments: * This routine initiates io of the indicated portion. It returns * synchronously after io is completed if NSC_NOBLOCK is not set. 
* Else NSC_PENDING is returned with a subsequent write callback on * io completion. * * See _sd_copy_direct() for usage when * (wr_handle != rd_handle && rd_handle != NULL) */ static int _sd_sync_write2(_sd_buf_handle_t *wr_handle, nsc_off_t wr_st_pos, nsc_size_t fba_len, int flag, _sd_buf_handle_t *rd_handle, nsc_off_t rd_st_pos) { void (*fn)(blind_t, nsc_off_t, nsc_size_t, int); _sd_cctl_t *wr_ent, *rd_ent; nsc_size_t this_len; nsc_off_t rd_pos, wr_pos; nsc_size_t log_bytes; int cd = HANDLE_CD(wr_handle); int err; uint_t dirty; struct buf *bp; LINTUSED(flag); _SD_DISCONNECT_CALLBACK(wr_handle); if (rd_handle == NULL) { rd_handle = wr_handle; rd_st_pos = wr_st_pos; } wr_ent = wr_handle->bh_centry; while (CENTRY_BLK(wr_ent) != FBA_TO_BLK_NUM(wr_st_pos)) wr_ent = wr_ent->cc_chain; rd_ent = rd_handle->bh_centry; while (CENTRY_BLK(rd_ent) != FBA_TO_BLK_NUM(rd_st_pos)) rd_ent = rd_ent->cc_chain; bp = sd_alloc_iob(_sd_cache_files[cd].cd_crdev, wr_st_pos, FBA_TO_BLK_LEN(fba_len) + 2, B_WRITE); if (bp == NULL) return (E2BIG); wr_pos = BLK_FBA_OFF(wr_st_pos); rd_pos = BLK_FBA_OFF(rd_st_pos); log_bytes = 0; do { this_len = min((BLK_FBAS - rd_pos), (BLK_FBAS - wr_pos)); if (this_len > fba_len) this_len = fba_len; /* * clear dirty bits in the write handle. */ if (CENTRY_DIRTY(wr_ent)) { mutex_enter(&wr_ent->cc_lock); if (CENTRY_DIRTY(wr_ent)) { if (this_len == (nsc_size_t)BLK_FBAS || rd_handle != wr_handle) { /* * optimization for when we have a * full cache block, or are doing * copy_direct (see below). */ wr_ent->cc_write->sc_dirty = 0; } else { dirty = wr_ent->cc_write->sc_dirty; dirty &= ~(SDBC_GET_BITS( wr_pos, this_len)); wr_ent->cc_write->sc_dirty = dirty; } SSOP_SETCENTRY(sdbc_safestore, wr_ent->cc_write); } mutex_exit(&wr_ent->cc_lock); } /* * update valid bits in the write handle. */ if (rd_handle == wr_handle) { if (this_len == (nsc_size_t)BLK_FBAS) { SET_FULLY_VALID(wr_ent); } else { SDBC_SET_VALID_BITS(wr_pos, this_len, wr_ent); } } else { /* * doing copy_direct, so mark the write handle * as invalid since the data is on disk, but not * in cache. */ wr_ent->cc_valid = 0; } DATA_LOG(SDF_WRSYNC, rd_ent, rd_pos, this_len); DTRACE_PROBE4(_sd_sync_write2_data, uint64_t, (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(rd_ent)) + rd_pos, uint64_t, (uint64_t)this_len, char *, *(int64_t *)(rd_ent->cc_data + FBA_SIZE(rd_pos)), char *, *(int64_t *)(rd_ent->cc_data + FBA_SIZE(rd_pos + this_len) - 8)); sd_add_fba(bp, &rd_ent->cc_addr, rd_pos, this_len); log_bytes += FBA_SIZE(this_len); fba_len -= this_len; wr_pos += this_len; if (wr_pos >= (nsc_size_t)BLK_FBAS) { wr_ent = wr_ent->cc_chain; wr_pos = 0; } rd_pos += this_len; if (rd_pos >= (nsc_size_t)BLK_FBAS) { rd_ent = rd_ent->cc_chain; rd_pos = 0; } } while (fba_len > 0); DISK_FBA_WRITE(cd, FBA_NUM(log_bytes)); CACHE_WRITE_MISS; FBA_WRITE_IO_KSTATS(cd, log_bytes); fn = (wr_handle->bh_flag & NSC_NOBLOCK) ? _sd_async_write_ea : NULL; err = sd_start_io(bp, _sd_cache_files[cd].cd_strategy, fn, wr_handle); if (err != NSC_PENDING) { DATA_LOG_CHAIN(SDF_WRSYEA, wr_handle->bh_centry, wr_st_pos, FBA_NUM(log_bytes)); } return (err); } static int _sd_sync_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len, int flag) { return (_sd_sync_write2(handle, fba_pos, fba_len, flag, NULL, 0)); } /* * _sd_zero - Interface call to zero out a portion of cache blocks. * * ARGUMENTS: * handle - handle allocated earlier on. * fba_pos - disk block number to zero from. * fba_len - length in fbas. * flag - NSC_NOBLOCK for async io. 
* * RETURNS: * errno if return > 0 * NSC_DONE or NSC_PENDING otherwise. * * Comments: * This routine zeroes out the indicated portion of the cache blocks * and commits the data to disk. * (See write for more details on the commit) */ int _sd_zero(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len, int flag) { int cd; sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ nsc_size_t cur_fba_len; /* position in disk blocks */ int ret; _sd_cctl_t *cc_ent; if (_sdbc_shutdown_in_progress) { DTRACE_PROBE(shutdown); return (EIO); } if (!_SD_HANDLE_ACTIVE(handle)) { cmn_err(CE_WARN, "!sdbc(_sd_zero) handle %p not active", (void *)handle); DTRACE_PROBE1(handle_active, int, handle->bh_flag); return (EINVAL); } ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len); if ((handle->bh_flag & NSC_WRBUF) == 0) { DTRACE_PROBE1(handle_write, int, handle->bh_flag); return (EINVAL); } if (fba_len == 0) { DTRACE_PROBE(zero_len); return (NSC_DONE); } if (_SD_FORCE_DISCONNECT(fba_len)) _SD_DISCONNECT_CALLBACK(handle); cd = HANDLE_CD(handle); SDTRACE(ST_ENTER|SDF_ZERO, cd, fba_len, fba_pos, flag, 0); cc_ent = handle->bh_centry; while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos)) cc_ent = cc_ent->cc_chain; cur_fba_len = fba_len; st_cblk_off = BLK_FBA_OFF(fba_pos); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); } cur_fba_len -= st_cblk_len; bzero(cc_ent->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len)); cc_ent = cc_ent->cc_chain; while (cur_fba_len > (nsc_size_t)end_cblk_len) { cur_fba_len -= BLK_FBAS; bzero(cc_ent->cc_data, CACHE_BLOCK_SIZE); cc_ent = cc_ent->cc_chain; } if (cur_fba_len) { bzero(cc_ent->cc_data, FBA_SIZE(cur_fba_len)); } ret = _sd_write(handle, fba_pos, fba_len, flag); SDTRACE(ST_EXIT|SDF_ZERO, cd, fba_len, fba_pos, flag, ret); return (ret); } /* * _sd_copy - Copies portions of 2 handles. * * ARGUMENTS: * handle1 - handle allocated earlier on. * handle2 - handle allocated earlier on. * fba_pos1 - disk block number to read from. * fba_pos2 - disk block number to write to. * fba_len - length in fbas. * * RETURNS: * errno if return > 0 * NSC_DONE otherwise. * * Comments: * This routine copies the 2 handles. * WARNING: this could put the cache blocks in the destination handle * in an inconsistent state. 
(the blocks could be valid in cache, * but the copy makes the cache different from disk) * */ int _sd_copy(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2, nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len) { sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ nsc_off_t off1, off2; /* offsets in FBA's into the disk */ nsc_size_t cur_fba_len; /* position in disk blocks */ _sd_cctl_t *cc_ent1, *cc_ent2; if (_sdbc_shutdown_in_progress) { DTRACE_PROBE(shutdown); return (EIO); } if (!_SD_HANDLE_ACTIVE(handle1) || !_SD_HANDLE_ACTIVE(handle2)) { cmn_err(CE_WARN, "!sdbc(_sd_copy) handle %p or %p not active", (void *)handle1, (void *)handle2); DTRACE_PROBE2(handle_active1, int, handle1->bh_flag, int, handle2->bh_flag); return (EINVAL); } ASSERT_HANDLE_LIMITS(handle1, fba_pos1, fba_len); ASSERT_HANDLE_LIMITS(handle2, fba_pos2, fba_len); cc_ent1 = handle1->bh_centry; while (CENTRY_BLK(cc_ent1) != FBA_TO_BLK_NUM(fba_pos1)) cc_ent1 = cc_ent1->cc_chain; cc_ent2 = handle2->bh_centry; while (CENTRY_BLK(cc_ent2) != FBA_TO_BLK_NUM(fba_pos2)) cc_ent2 = cc_ent2->cc_chain; if (BLK_FBA_OFF(fba_pos1) != BLK_FBA_OFF(fba_pos2)) { /* Different offsets, do it slowly (per fba) */ while (fba_len) { off1 = FBA_SIZE(BLK_FBA_OFF(fba_pos1)); off2 = FBA_SIZE(BLK_FBA_OFF(fba_pos2)); bcopy(cc_ent1->cc_data+off1, cc_ent2->cc_data+off2, FBA_SIZE(1)); fba_pos1++; fba_pos2++; fba_len--; if (FBA_TO_BLK_NUM(fba_pos1) != CENTRY_BLK(cc_ent1)) cc_ent1 = cc_ent1->cc_chain; if (FBA_TO_BLK_NUM(fba_pos2) != CENTRY_BLK(cc_ent2)) cc_ent2 = cc_ent2->cc_chain; } DTRACE_PROBE(_sd_copy_end); return (NSC_DONE); } cur_fba_len = fba_len; st_cblk_off = BLK_FBA_OFF(fba_pos1); st_cblk_len = BLK_FBAS - st_cblk_off; if ((nsc_size_t)st_cblk_len >= fba_len) { end_cblk_len = 0; st_cblk_len = (sdbc_cblk_fba_t)fba_len; } else { end_cblk_len = BLK_FBA_OFF(fba_pos1 + fba_len); } bcopy(cc_ent1->cc_data + FBA_SIZE(st_cblk_off), cc_ent2->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len)); cur_fba_len -= st_cblk_len; cc_ent1 = cc_ent1->cc_chain; cc_ent2 = cc_ent2->cc_chain; while (cur_fba_len > (nsc_size_t)end_cblk_len) { bcopy(cc_ent1->cc_data, cc_ent2->cc_data, CACHE_BLOCK_SIZE); cc_ent1 = cc_ent1->cc_chain; cc_ent2 = cc_ent2->cc_chain; cur_fba_len -= BLK_FBAS; } if (cur_fba_len) { bcopy(cc_ent1->cc_data, cc_ent2->cc_data, FBA_SIZE(end_cblk_len)); } return (NSC_DONE); } /* * _sd_copy_direct - Copies data from one handle direct to another disk. * * ARGUMENTS: * handle1 - handle to read from * handle2 - handle to write to * fba_pos1 - disk block number to read from. * fba_pos2 - disk block number to write to. * fba_len - length in fbas. * * RETURNS: * errno if return > 0 * NSC_DONE otherwise. * * Comments: * This routine copies data from handle1 directly (sync write) * onto the disk pointed to by handle2. The handle2 is then * invalidated since the data it contains is now stale compared to * the disk. 
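 *
 * In effect (see the body below) the whole operation reduces to
 *
 *	rc = _sd_sync_write2(handle2, fba_pos2, fba_len, 0,
 *	    handle1, fba_pos1);
 *
 * i.e. a synchronous write that takes its data from handle1's cache
 * blocks and, because the read and write handles differ, clears
 * cc_valid on handle2's cache entries as it goes.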
*/ static int _sd_copy_direct(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2, nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len) { int rc; if (_sdbc_shutdown_in_progress) { DTRACE_PROBE(shutdown); return (EIO); } if (!_SD_HANDLE_ACTIVE(handle1) || !_SD_HANDLE_ACTIVE(handle2)) { cmn_err(CE_WARN, "!sdbc(_sd_copy_direct) handle %p or %p not active", (void *)handle1, (void *)handle2); DTRACE_PROBE2(handle_active2, int, handle1->bh_flag, int, handle2->bh_flag); return (EINVAL); } ASSERT_HANDLE_LIMITS(handle1, fba_pos1, fba_len); ASSERT_HANDLE_LIMITS(handle2, fba_pos2, fba_len); if ((handle2->bh_flag & NSC_WRITE) == 0) { cmn_err(CE_WARN, "!sdbc(_sd_copy_direct) handle2 %p is not writeable", (void *)handle2); DTRACE_PROBE1(handle2_write, int, handle2->bh_flag); return (EINVAL); } rc = _sd_sync_write2(handle2, fba_pos2, fba_len, 0, handle1, fba_pos1); return (rc); } /* * _sd_enqueue_dirty - Enqueue a list of dirty buffers. * * ARGUMENTS: * cd - cache descriptor. * chain - pointer to list. * cc_last - last entry in the chain. * numq - number of entries in the list. * * RETURNS: * NONE. * * Comments: * This routine queues up the dirty blocks for io processing. * It uses the cc_last to try to coalesce multiple lists into a * single list, if consecutive writes are sequential in nature. */ void _sd_enqueue_dirty(int cd, _sd_cctl_t *chain, _sd_cctl_t *cc_last, int numq) { _sd_cd_info_t *cdi; _sd_cctl_t *last_ent; int start_write = 0, maxq = SGIO_MAX; ASSERT(cd >= 0); cdi = &(_sd_cache_files[cd]); #if defined(_SD_DEBUG) if (chain->cc_dirty_link) cmn_err(CE_WARN, "!dirty_link set in enq %x fl %x", chain->cc_dirty_link, chain->cc_flag); #endif /* was FAST */ mutex_enter(&(cdi->cd_lock)); cdi->cd_info->sh_numdirty += numq; if (cc_last == NULL) numq = 0; if (cdi->cd_dirty_head == NULL) { cdi->cd_dirty_head = cdi->cd_dirty_tail = chain; cdi->cd_last_ent = cc_last; cdi->cd_lastchain_ptr = chain; cdi->cd_lastchain = numq; } else { if ((cc_last) && (last_ent = cdi->cd_last_ent) && (CENTRY_BLK(chain) == (CENTRY_BLK(last_ent)+1)) && (SDBC_DIRTY_NEIGHBORS(last_ent, chain)) && (cdi->cd_lastchain + numq < maxq)) { cdi->cd_last_ent->cc_dirty_next = chain; cdi->cd_last_ent = cc_last; cdi->cd_lastchain += numq; } else { cdi->cd_dirty_tail->cc_dirty_link = chain; cdi->cd_dirty_tail = chain; cdi->cd_last_ent = cc_last; cdi->cd_lastchain_ptr = chain; cdi->cd_lastchain = numq; start_write = 1; } } /* was FAST */ mutex_exit(&(cdi->cd_lock)); if (start_write) (void) _SD_CD_WRITER(cd); } /* * _sd_enqueue_dirty_chain - Enqueue a chain of a list of dirty buffers. * * ARGUMENTS: * cd - cache descriptor. * chain_first - first list in this chain. * chain_last - last list in this chain. * numq - number of entries being queue (total of all lists) * * RETURNS: * NONE. * * Comments: * This routine is called from the processing after io completions. * If the buffers are still dirty, they are queued up in one shot. 
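 *
 * The per-device dirty queue is, roughly, two dimensional: blocks
 * within one sequential chain are linked by cc_dirty_next, and the
 * chains themselves are linked by cc_dirty_link, e.g.
 *
 *	cd_dirty_head --> [blk 10] --cc_dirty_next--> [blk 11]
 *	                      |
 *	                 cc_dirty_link
 *	                      |
 *	cd_dirty_tail --> [blk 57] --cc_dirty_next--> [blk 58]  (cd_last_ent)
 *
 * _sd_enqueue_dirty() grows the tail chain while writes remain
 * sequential; this routine appends already formed chains wholesale.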
*/ void _sd_enqueue_dirty_chain(int cd, _sd_cctl_t *chain_first, _sd_cctl_t *chain_last, int numq) { _sd_cd_info_t *cdi; ASSERT(cd >= 0); cdi = &(_sd_cache_files[cd]); if (chain_last->cc_dirty_link) cmn_err(CE_PANIC, "!_sd_enqueue_dirty_chain: chain_last %p dirty_link %p", (void *)chain_last, (void *)chain_last->cc_dirty_link); /* was FAST */ mutex_enter(&(cdi->cd_lock)); cdi->cd_last_ent = NULL; cdi->cd_lastchain_ptr = NULL; cdi->cd_lastchain = 0; cdi->cd_info->sh_numdirty += numq; if (cdi->cd_dirty_head == NULL) { cdi->cd_dirty_head = chain_first; cdi->cd_dirty_tail = chain_last; } else { cdi->cd_dirty_tail->cc_dirty_link = chain_first; cdi->cd_dirty_tail = chain_last; } /* was FAST */ mutex_exit(&(cdi->cd_lock)); } #ifndef _MULTI_DATAMODEL /* ARGSUSED */ #endif static int convert_stats(_sd_stats32_t *uptr) /* * Convert the 64 bit statistic structure to 32bit version. * Possibly losing information when cache is > 4gb. Ha! * * NOTE: this code isn't really MT ready since the copied to struct * is static. However the race is pretty benign and isn't a whole * lot worse than the vanilla version which copies data to user * space from kernel structures that can be changing under it too. * We can't use a local stack structure since the data size is * 70k or so and kernel stacks are tiny (8k). */ { #ifndef _MULTI_DATAMODEL return (SDBC_EMODELCONVERT); #else int rc = 0; /* * This could be done in less code with bcopy type operations * but this is simpler to follow and easier to change if * the structures change. */ _sd_cache_stats32->net_dirty = _sd_cache_stats->net_dirty; _sd_cache_stats32->net_pending = _sd_cache_stats->net_pending; _sd_cache_stats32->net_free = _sd_cache_stats->net_free; _sd_cache_stats32->st_count = _sd_cache_stats->st_count; _sd_cache_stats32->st_loc_count = _sd_cache_stats->st_loc_count; _sd_cache_stats32->st_rdhits = _sd_cache_stats->st_rdhits; _sd_cache_stats32->st_rdmiss = _sd_cache_stats->st_rdmiss; _sd_cache_stats32->st_wrhits = _sd_cache_stats->st_wrhits; _sd_cache_stats32->st_wrmiss = _sd_cache_stats->st_wrmiss; _sd_cache_stats32->st_blksize = _sd_cache_stats->st_blksize; _sd_cache_stats32->st_lru_blocks = _sd_cache_stats->st_lru_blocks; _sd_cache_stats32->st_lru_noreq = _sd_cache_stats->st_lru_noreq; _sd_cache_stats32->st_lru_req = _sd_cache_stats->st_lru_req; _sd_cache_stats32->st_wlru_inq = _sd_cache_stats->st_wlru_inq; _sd_cache_stats32->st_cachesize = _sd_cache_stats->st_cachesize; _sd_cache_stats32->st_numblocks = _sd_cache_stats->st_numblocks; _sd_cache_stats32->st_wrcancelns = _sd_cache_stats->st_wrcancelns; _sd_cache_stats32->st_destaged = _sd_cache_stats->st_destaged; /* * bcopy the shared stats which has nothing that needs conversion * in them */ bcopy(_sd_cache_stats->st_shared, _sd_cache_stats32->st_shared, sizeof (_sd_shared_t) * sdbc_max_devs); if (copyout(_sd_cache_stats32, uptr, sizeof (_sd_stats32_t) + (sdbc_max_devs - 1) * sizeof (_sd_shared_t))) rc = EFAULT; return (rc); #endif /* _MULTI_DATAMODEL */ } int _sd_get_stats(_sd_stats_t *uptr, int convert_32) { int rc = 0; if (_sd_cache_stats == NULL) { static _sd_stats_t dummy; #ifdef _MULTI_DATAMODEL static _sd_stats32_t dummy32; #endif if (convert_32) { #ifdef _MULTI_DATAMODEL if (copyout(&dummy32, uptr, sizeof (_sd_stats32_t))) rc = EFAULT; #else rc = SDBC_EMODELCONVERT; #endif } else if (copyout(&dummy, uptr, sizeof (_sd_stats_t))) rc = EFAULT; return (rc); } _sd_cache_stats->st_lru_blocks = _sd_lru_q.sq_inq; _sd_cache_stats->st_lru_noreq = _sd_lru_q.sq_noreq_stat; _sd_cache_stats->st_lru_req = 
_sd_lru_q.sq_req_stat; if (sdbc_safestore) { ssioc_stats_t ss_stats; if (SSOP_CTL(sdbc_safestore, SSIOC_STATS, (uintptr_t)&ss_stats) == 0) _sd_cache_stats->st_wlru_inq = ss_stats.wq_inq; else _sd_cache_stats->st_wlru_inq = 0; } if (convert_32) rc = convert_stats((_sd_stats32_t *)uptr); else if (copyout(_sd_cache_stats, uptr, sizeof (_sd_stats_t) + (sdbc_max_devs - 1) * sizeof (_sd_shared_t))) rc = EFAULT; return (rc); } int _sd_set_hint(int cd, uint_t hint) { int ret = 0; if (FILE_OPENED(cd)) { SDTRACE(ST_ENTER|SDF_HINT, cd, 1, SDT_INV_BL, hint, 0); _sd_cache_files[cd].cd_hint |= (hint & _SD_HINT_MASK); SDTRACE(ST_EXIT|SDF_HINT, cd, 1, SDT_INV_BL, hint, ret); } else ret = EINVAL; return (ret); } int _sd_clear_hint(int cd, uint_t hint) { int ret = 0; if (FILE_OPENED(cd)) { SDTRACE(ST_ENTER|SDF_HINT, cd, 2, SDT_INV_BL, hint, 0); _sd_cache_files[cd].cd_hint &= ~(hint & _SD_HINT_MASK); SDTRACE(ST_EXIT|SDF_HINT, cd, 2, SDT_INV_BL, hint, ret); } else ret = EINVAL; return (ret); } int _sd_get_cd_hint(int cd, uint_t *hint) { *hint = 0; if (FILE_OPENED(cd)) { *hint = _sd_cache_files[cd].cd_hint; return (0); } else return (EINVAL); } static int _sd_node_hint_caller(blind_t hint, int hint_action) { int rc; switch (hint_action) { case NSC_GET_NODE_HINT: rc = _sd_get_node_hint((uint_t *)hint); break; case NSC_SET_NODE_HINT: rc = _sd_set_node_hint((uint_t)(unsigned long)hint); break; case NSC_CLEAR_NODE_HINT: rc = _sd_clear_node_hint((uint_t)(unsigned long)hint); break; default: rc = EINVAL; break; } return (rc); } int _sd_set_node_hint(uint_t hint) { SDTRACE(ST_ENTER|SDF_HINT, SDT_INV_CD, 3, SDT_INV_BL, hint, 0); if ((_sd_node_hint & NSC_NO_FORCED_WRTHRU) && (hint & NSC_FORCED_WRTHRU)) return (EINVAL); _sd_node_hint |= (hint & _SD_HINT_MASK); SDTRACE(ST_EXIT|SDF_HINT, SDT_INV_CD, 3, SDT_INV_BL, hint, 0); return (0); } int _sd_clear_node_hint(uint_t hint) { SDTRACE(ST_ENTER|SDF_HINT, SDT_INV_CD, 4, SDT_INV_BL, hint, 0); _sd_node_hint &= ~(hint & _SD_HINT_MASK); SDTRACE(ST_EXIT|SDF_HINT, SDT_INV_CD, 4, SDT_INV_BL, hint, 0); return (0); } int _sd_get_node_hint(uint_t *hint) { *hint = _sd_node_hint; return (0); } int _sd_get_partsize(blind_t xcd, nsc_size_t *ptr) { int cd = (int)(unsigned long)xcd; if (FILE_OPENED(cd)) { *ptr = _sd_cache_files[cd].cd_info->sh_filesize; return (0); } else return (EINVAL); } int _sd_get_maxfbas(blind_t xcd, int flag, nsc_size_t *ptr) { int cd = (int)(unsigned long)xcd; if (!FILE_OPENED(cd)) return (EINVAL); if (flag & NSC_CACHEBLK) *ptr = BLK_FBAS; else *ptr = sdbc_max_fbas; return (0); } int _sd_control(blind_t xcd, int cmd, void *ptr, int len) { _sd_cd_info_t *cdi; int cd = (int)(unsigned long)xcd; cdi = &(_sd_cache_files[cd]); return (nsc_control(cdi->cd_rawfd, cmd, ptr, len)); } int _sd_discard_pinned(blind_t xcd, nsc_off_t fba_pos, nsc_size_t fba_len) { int cd = (int)(unsigned long)xcd; _sd_cctl_t *cc_ent, **cc_lst, **cc_tmp, *nxt; ss_centry_info_t *wctl; int found = 0; nsc_off_t cblk; _sd_cd_info_t *cdi = &_sd_cache_files[cd]; int rc; if ((!FILE_OPENED(cd)) || (!cdi->cd_info->sh_failed)) { return (EINVAL); } for (cblk = FBA_TO_BLK_NUM(fba_pos); cblk < FBA_TO_BLK_LEN(fba_pos + fba_len); cblk++) { if (cc_ent = (_sd_cctl_t *)_sd_hash_search(cd, cblk, _sd_htable)) { if (!CENTRY_PINNED(cc_ent)) continue; /* * remove cc_ent from failed links * cc_lst - pointer to "cc_dirty_link" pointer * starts at &cd_failed_head. * cc_tmp - pointer to "cc_dirty_next" * except when equal to cc_lst. 
*/ mutex_enter(&cdi->cd_lock); cc_tmp = cc_lst = &(cdi->cd_fail_head); while (*cc_tmp != cc_ent) { cc_tmp = &((*cc_tmp)->cc_dirty_next); if (!*cc_tmp) cc_lst = &((*cc_lst)->cc_dirty_link), cc_tmp = cc_lst; } if (*cc_tmp) { found++; if (cc_lst != cc_tmp) /* break chain */ *cc_tmp = NULL; nxt = cc_ent->cc_dirty_next; if (nxt) { nxt->cc_dirty_link = (*cc_lst)->cc_dirty_link; *cc_lst = nxt; } else { *cc_lst = (*cc_lst)->cc_dirty_link; } cdi->cd_info->sh_numfail--; nsc_unpinned_data(cdi->cd_iodev, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), BLK_FBAS); } mutex_exit(&cdi->cd_lock); /* clear dirty bits */ /* was FAST */ mutex_enter(&cc_ent->cc_lock); cc_ent->cc_valid = cc_ent->cc_dirty = 0; cc_ent->cc_flag &= ~(CC_QHEAD|CC_PEND_DIRTY|CC_PINNED); cc_ent->cc_dirty_link = NULL; wctl = cc_ent->cc_write; cc_ent->cc_write = NULL; /* was FAST */ mutex_exit(&cc_ent->cc_lock); /* release cache block to head of LRU */ if (wctl) { wctl->sc_flag = 0; wctl->sc_dirty = 0; SSOP_SETCENTRY(sdbc_safestore, wctl); SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res); } if (!sdbc_use_dmchain) _sd_requeue_head(cc_ent); } } rc = found ? NSC_DONE : EINVAL; return (rc); } /* * Handle allocation */ _sd_buf_hlist_t _sd_handle_list; /* * _sdbc_handles_unload - cache is being unloaded. */ void _sdbc_handles_unload(void) { mutex_destroy(&_sd_handle_list.hl_lock); } /* * _sdbc_handles_load - cache is being unloaded. */ int _sdbc_handles_load(void) { mutex_init(&_sd_handle_list.hl_lock, NULL, MUTEX_DRIVER, NULL); return (0); } int _sdbc_handles_configure() { _sd_handle_list.hl_count = 0; _sd_handle_list.hl_top.bh_next = &_sd_handle_list.hl_top; _sd_handle_list.hl_top.bh_prev = &_sd_handle_list.hl_top; return (0); } /* * _sdbc_handles_deconfigure - cache is being deconfigured */ void _sdbc_handles_deconfigure(void) { _sd_handle_list.hl_count = 0; } _sd_buf_handle_t * _sd_alloc_handle(sdbc_callback_fn_t d_cb, sdbc_callback_fn_t r_cb, sdbc_callback_fn_t w_cb) { _sd_buf_handle_t *handle; handle = (_sd_buf_handle_t *)kmem_zalloc(sizeof (_sd_buf_handle_t), KM_SLEEP); /* maintain list and count for debugging */ mutex_enter(&_sd_handle_list.hl_lock); handle->bh_prev = &_sd_handle_list.hl_top; handle->bh_next = _sd_handle_list.hl_top.bh_next; _sd_handle_list.hl_top.bh_next->bh_prev = handle; _sd_handle_list.hl_top.bh_next = handle; ++_sd_handle_list.hl_count; mutex_exit(&_sd_handle_list.hl_lock); #if !defined(_SD_NOCHECKS) ASSERT(!(handle->bh_flag & (NSC_HALLOCATED | NSC_HACTIVE))); #endif handle->bh_disconnect_cb = d_cb; handle->bh_read_cb = r_cb; handle->bh_write_cb = w_cb; handle->bh_flag |= NSC_HALLOCATED; handle->bh_alloc_thread = nsc_threadp(); return (handle); } int _sd_free_handle(_sd_buf_handle_t *handle) { if ((handle->bh_flag & NSC_HALLOCATED) == 0) { cmn_err(CE_WARN, "!sdbc(_sd_free_handle) handle %p not valid", (void *)handle); DTRACE_PROBE(_sd_free_handle_end); return (EINVAL); } if (_SD_HANDLE_ACTIVE(handle)) { cmn_err(CE_WARN, "!sdbc(_sd_free_handle) attempt to free active handle %p", (void *)handle); DTRACE_PROBE1(free_handle_active, int, handle->bh_flag); return (EINVAL); } /* remove from queue before free */ mutex_enter(&_sd_handle_list.hl_lock); handle->bh_prev->bh_next = handle->bh_next; handle->bh_next->bh_prev = handle->bh_prev; --_sd_handle_list.hl_count; mutex_exit(&_sd_handle_list.hl_lock); kmem_free(handle, sizeof (_sd_buf_handle_t)); return (0); } #if !defined (_SD_8K_BLKSIZE) #define _SD_MAX_MAP 0x100 #else /* !(_SD_8K_BLKSIZE) */ #define _SD_MAX_MAP 0x10000 #endif /* !(_SD_8K_BLKSIZE) */ char 
_sd_contig_bmap[_SD_MAX_MAP]; _sd_map_info_t _sd_lookup_map[_SD_MAX_MAP]; void _sd_init_contig_bmap(void) { int i, j; for (i = 1; i < _SD_MAX_MAP; i = ((i << 1) | 1)) for (j = i; j < _SD_MAX_MAP; j <<= 1) _sd_contig_bmap[j] = 1; } void _sd_init_lookup_map(void) { unsigned int i, j, k; int stpos, len; _sd_bitmap_t mask; for (i = 0; i < _SD_MAX_MAP; i++) { for (j = i, k = 0; j && ((j & 1) == 0); j >>= 1, k++) ; stpos = k; _sd_lookup_map[i].mi_stpos = (unsigned char)k; for (k = 0; j & 1; j >>= 1, k++) ; len = k; _sd_lookup_map[i].mi_len = (unsigned char)k; _sd_lookup_map[i].mi_mask = SDBC_GET_BITS(stpos, len); } for (i = 0; i < _SD_MAX_MAP; i++) { mask = (_sd_bitmap_t)i; for (j = 0; mask; j++) SDBC_LOOKUP_MODIFY(mask); _sd_lookup_map[i].mi_dirty_count = (unsigned char)j; } for (i = 0; i < _SD_MAX_MAP; i++) { _sd_lookup_map[i].mi_io_count = SDBC_LOOKUP_DTCOUNT(i); mask = ~i; _sd_lookup_map[i].mi_io_count += SDBC_LOOKUP_DTCOUNT(mask); } } nsc_def_t _sd_sdbc_def[] = { "Open", (uintptr_t)_sd_open_io, 0, "Close", (uintptr_t)_sd_close_io, 0, "Attach", (uintptr_t)_sdbc_io_attach_cd, 0, "Detach", (uintptr_t)_sdbc_io_detach_cd, 0, "AllocBuf", (uintptr_t)_sd_alloc_buf, 0, "FreeBuf", (uintptr_t)_sd_free_buf, 0, "Read", (uintptr_t)_sd_read, 0, "Write", (uintptr_t)_sd_write, 0, "Zero", (uintptr_t)_sd_zero, 0, "Copy", (uintptr_t)_sd_copy, 0, "CopyDirect", (uintptr_t)_sd_copy_direct, 0, "Uncommit", (uintptr_t)_sd_uncommit, 0, "AllocHandle", (uintptr_t)_sd_alloc_handle, 0, "FreeHandle", (uintptr_t)_sd_free_handle, 0, "Discard", (uintptr_t)_sd_discard_pinned, 0, "Sizes", (uintptr_t)_sd_cache_sizes, 0, "GetPinned", (uintptr_t)_sd_get_pinned, 0, "NodeHints", (uintptr_t)_sd_node_hint_caller, 0, "PartSize", (uintptr_t)_sd_get_partsize, 0, "MaxFbas", (uintptr_t)_sd_get_maxfbas, 0, "Control", (uintptr_t)_sd_control, 0, "Provide", NSC_CACHE, 0, 0, 0, 0 }; /* * do the SD_GET_CD_CLUSTER_DATA ioctl (get the global filename data) */ /* ARGSUSED */ int sd_get_file_info_data(char *uaddrp) { return (ENOTTY); } /* * do the SD_GET_CD_CLUSTER_SIZE ioctl (get size of global filename area) */ int sd_get_file_info_size(void *uaddrp) { if (copyout(&_sdbc_gl_file_info_size, uaddrp, sizeof (_sdbc_gl_file_info_size))) { return (EFAULT); } return (0); } /* * SD_GET_GLMUL_SIZES ioctl * get sizes of the global info regions (for this node only) */ /* ARGSUSED */ int sd_get_glmul_sizes(int *uaddrp) { return (ENOTTY); } /* * SD_GET_GLMUL_INFO ioctl * get the global metadata for write blocks (for this node only) */ /* ARGSUSED */ int sd_get_glmul_info(char *uaddrp) { return (ENOTTY); } int sdbc_global_stats_update(kstat_t *ksp, int rw) { sdbc_global_stats_t *sdbc_gstats; _sd_stats_t *gstats_vars; uint_t hint; sdbc_gstats = (sdbc_global_stats_t *)(ksp->ks_data); gstats_vars = _sd_cache_stats; if (rw == KSTAT_WRITE) { return (EACCES); } /* default to READ */ sdbc_gstats->ci_sdbc_count.value.ul = gstats_vars->st_count; sdbc_gstats->ci_sdbc_loc_count.value.ul = gstats_vars->st_loc_count; sdbc_gstats->ci_sdbc_rdhits.value.ul = (ulong_t)gstats_vars->st_rdhits; sdbc_gstats->ci_sdbc_rdmiss.value.ul = (ulong_t)gstats_vars->st_rdmiss; sdbc_gstats->ci_sdbc_wrhits.value.ul = (ulong_t)gstats_vars->st_wrhits; sdbc_gstats->ci_sdbc_wrmiss.value.ul = (ulong_t)gstats_vars->st_wrmiss; sdbc_gstats->ci_sdbc_blksize.value.ul = (ulong_t)gstats_vars->st_blksize; sdbc_gstats->ci_sdbc_lru_blocks.value.ul = (ulong_t)_sd_lru_q.sq_inq; #ifdef DEBUG sdbc_gstats->ci_sdbc_lru_noreq.value.ul = (ulong_t)_sd_lru_q.sq_noreq_stat; sdbc_gstats->ci_sdbc_lru_req.value.ul = 
(ulong_t)_sd_lru_q.sq_req_stat; #endif sdbc_gstats->ci_sdbc_wlru_inq.value.ul = (ulong_t)gstats_vars->st_wlru_inq; sdbc_gstats->ci_sdbc_cachesize.value.ul = (ulong_t)gstats_vars->st_cachesize; sdbc_gstats->ci_sdbc_numblocks.value.ul = (ulong_t)gstats_vars->st_numblocks; sdbc_gstats->ci_sdbc_wrcancelns.value.ul = (ulong_t)gstats_vars->st_wrcancelns; sdbc_gstats->ci_sdbc_destaged.value.ul = (ulong_t)gstats_vars->st_destaged; sdbc_gstats->ci_sdbc_num_shared.value.ul = (ulong_t)sdbc_max_devs; (void) _sd_get_node_hint(&hint); sdbc_gstats->ci_sdbc_nodehints.value.ul = (ulong_t)hint; return (0); } int sdbc_cd_stats_update(kstat_t *ksp, int rw) { sdbc_cd_stats_t *sdbc_shstats; _sd_shared_t *shstats_vars; int name_len; uint_t hint; sdbc_shstats = (sdbc_cd_stats_t *)(ksp->ks_data); shstats_vars = (_sd_shared_t *)(ksp->ks_private); if (rw == KSTAT_WRITE) { return (EACCES); } /* copy tail of filename to kstat. leave 1 byte for null char */ if (shstats_vars->sh_filename != NULL) { name_len = (int)strlen(shstats_vars->sh_filename); name_len -= (KSTAT_DATA_CHAR_LEN - 1); if (name_len < 0) { name_len = 0; } (void) strlcpy(sdbc_shstats->ci_sdbc_vol_name.value.c, shstats_vars->sh_filename + name_len, KSTAT_DATA_CHAR_LEN); } else { cmn_err(CE_WARN, "!Kstat error: no volume name associated " "with cache descriptor"); } sdbc_shstats->ci_sdbc_failed.value.ul = (ulong_t)shstats_vars->sh_failed; sdbc_shstats->ci_sdbc_cd.value.ul = (ulong_t)shstats_vars->sh_cd; sdbc_shstats->ci_sdbc_cache_read.value.ul = (ulong_t)shstats_vars->sh_cache_read; sdbc_shstats->ci_sdbc_cache_write.value.ul = (ulong_t)shstats_vars->sh_cache_write; sdbc_shstats->ci_sdbc_disk_read.value.ul = (ulong_t)shstats_vars->sh_disk_read; sdbc_shstats->ci_sdbc_disk_write.value.ul = (ulong_t)shstats_vars->sh_disk_write; #ifdef NSC_MULTI_TERABYTE sdbc_shstats->ci_sdbc_filesize.value.ui64 = (uint64_t)shstats_vars->sh_filesize; #else sdbc_shstats->ci_sdbc_filesize.value.ul = (ulong_t)shstats_vars->sh_filesize; #endif sdbc_shstats->ci_sdbc_numdirty.value.ul = (ulong_t)shstats_vars->sh_numdirty; sdbc_shstats->ci_sdbc_numio.value.ul = (ulong_t)shstats_vars->sh_numio; sdbc_shstats->ci_sdbc_numfail.value.ul = (ulong_t)shstats_vars->sh_numfail; sdbc_shstats->ci_sdbc_destaged.value.ul = (ulong_t)shstats_vars->sh_destaged; sdbc_shstats->ci_sdbc_wrcancelns.value.ul = (ulong_t)shstats_vars->sh_wrcancelns; (void) _sd_get_cd_hint(shstats_vars->sh_cd, &hint); sdbc_shstats->ci_sdbc_cdhints.value.ul = (ulong_t)hint; return (0); } /* * cd_kstat_add * * Installs all kstats and associated infrastructure (mutex, buffer), * associated with a particular cache descriptor. This function is called * when the cache descriptor is opened in _sd_open(). 
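 *
 * Note that the named kstat is created with KSTAT_FLAG_VIRTUAL, so the
 * framework does not allocate ks_data; every cache descriptor points
 * ks_data at the single static sdbc_cd_stats template and relies on
 * ks_private (the per-descriptor _sd_shared_t) to tell the update
 * routine which device is being queried.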
* "cd" -- cache descriptor number whose kstats we wish to add * returns: 0 on success, -1 on failure */ static int cd_kstat_add(int cd) { char name[KSTAT_STRLEN]; if (cd < 0 || cd >= sdbc_max_devs) { cmn_err(CE_WARN, "!invalid cache descriptor: %d", cd); return (-1); } /* create a regular kstat for this cache descriptor */ if (!sdbc_cd_kstats) { cmn_err(CE_WARN, "!sdbc_cd_kstats not allocated"); return (-1); } (void) snprintf(name, KSTAT_STRLEN, "%s%d", SDBC_KSTAT_CDSTATS, cd); sdbc_cd_kstats[cd] = kstat_create(SDBC_KSTAT_MODULE, cd, name, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED, sizeof (sdbc_cd_stats)/sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); if (sdbc_cd_kstats[cd] != NULL) { sdbc_cd_kstats[cd]->ks_data = &sdbc_cd_stats; sdbc_cd_kstats[cd]->ks_update = sdbc_cd_stats_update; sdbc_cd_kstats[cd]->ks_private = &_sd_cache_stats->st_shared[cd]; kstat_install(sdbc_cd_kstats[cd]); } else { cmn_err(CE_WARN, "!cdstats %d kstat allocation failed", cd); } /* create an I/O kstat for this cache descriptor */ if (!sdbc_cd_io_kstats) { cmn_err(CE_WARN, "!sdbc_cd_io_kstats not allocated"); return (-1); } (void) snprintf(name, KSTAT_STRLEN, "%s%d", SDBC_IOKSTAT_CDSTATS, cd); sdbc_cd_io_kstats[cd] = kstat_create( SDBC_KSTAT_MODULE, cd, name, "disk", KSTAT_TYPE_IO, 1, 0); if (sdbc_cd_io_kstats[cd]) { if (!sdbc_cd_io_kstats_mutexes) { cmn_err(CE_WARN, "!sdbc_cd_io_kstats_mutexes not " "allocated"); return (-1); } mutex_init(&sdbc_cd_io_kstats_mutexes[cd], NULL, MUTEX_DRIVER, NULL); sdbc_cd_io_kstats[cd]->ks_lock = &sdbc_cd_io_kstats_mutexes[cd]; kstat_install(sdbc_cd_io_kstats[cd]); } else { cmn_err(CE_WARN, "!sdbc cd %d io kstat allocation failed", cd); } return (0); } /* * cd_kstat_remove * * Uninstalls all kstats and associated infrastructure (mutex, buffer), * associated with a particular cache descriptor. This function is called * when the cache descriptor is closed in _sd_close(). 
* "cd" -- cache descriptor number whose kstats we wish to remove * returns: 0 on success, -1 on failure */ static int cd_kstat_remove(int cd) { if (cd < 0 || cd >= sdbc_max_devs) { cmn_err(CE_WARN, "!invalid cache descriptor: %d", cd); return (-1); } /* delete the regular kstat corresponding to this cache descriptor */ if (sdbc_cd_kstats && sdbc_cd_kstats[cd]) { kstat_delete(sdbc_cd_kstats[cd]); sdbc_cd_kstats[cd] = NULL; } /* delete the I/O kstat corresponding to this cache descriptor */ if (sdbc_cd_io_kstats && sdbc_cd_io_kstats[cd]) { kstat_delete(sdbc_cd_io_kstats[cd]); sdbc_cd_io_kstats[cd] = NULL; if (sdbc_cd_io_kstats_mutexes) { /* destroy the mutex associated with this I/O kstat */ mutex_destroy(&sdbc_cd_io_kstats_mutexes[cd]); } } return (0); } #ifdef DEBUG /* * kstat update */ int sdbc_dynmem_kstat_update_dm(kstat_t *ksp, int rw) { sdbc_dynmem_dm_t *sdbc_dynmem; _dm_process_vars_t *process_vars; _dm_process_vars_t local_dm_process_vars; simplect_dm++; sdbc_dynmem = (sdbc_dynmem_dm_t *)(ksp->ks_data); /* global dynmem_processing_dm */ process_vars = (_dm_process_vars_t *)(ksp->ks_private); if (rw == KSTAT_WRITE) { simplect_dm = sdbc_dynmem->ci_sdbc_simplect.value.ul; local_dm_process_vars.monitor_dynmem_process = sdbc_dynmem->ci_sdbc_monitor_dynmem.value.ul; local_dm_process_vars.max_dyn_list = sdbc_dynmem->ci_sdbc_max_dyn_list.value.ul; local_dm_process_vars.cache_aging_ct1 = sdbc_dynmem->ci_sdbc_cache_aging_ct1.value.ul; local_dm_process_vars.cache_aging_ct2 = sdbc_dynmem->ci_sdbc_cache_aging_ct2.value.ul; local_dm_process_vars.cache_aging_ct3 = sdbc_dynmem->ci_sdbc_cache_aging_ct3.value.ul; local_dm_process_vars.cache_aging_sec1 = sdbc_dynmem->ci_sdbc_cache_aging_sec1.value.ul; local_dm_process_vars.cache_aging_sec2 = sdbc_dynmem->ci_sdbc_cache_aging_sec2.value.ul; local_dm_process_vars.cache_aging_sec3 = sdbc_dynmem->ci_sdbc_cache_aging_sec3.value.ul; local_dm_process_vars.cache_aging_pcnt1 = sdbc_dynmem->ci_sdbc_cache_aging_pcnt1.value.ul; local_dm_process_vars.cache_aging_pcnt2 = sdbc_dynmem->ci_sdbc_cache_aging_pcnt2.value.ul; local_dm_process_vars.max_holds_pcnt = sdbc_dynmem->ci_sdbc_max_holds_pcnt.value.ul; local_dm_process_vars.process_directive = sdbc_dynmem->ci_sdbc_process_directive.value.ul; (void) sdbc_edit_xfer_process_vars_dm(&local_dm_process_vars); if (process_vars->process_directive & WAKE_DEALLOC_THREAD_DM) { process_vars->process_directive &= ~WAKE_DEALLOC_THREAD_DM; mutex_enter(&dynmem_processing_dm.thread_dm_lock); cv_broadcast(&dynmem_processing_dm.thread_dm_cv); mutex_exit(&dynmem_processing_dm.thread_dm_lock); } return (0); } /* default to READ */ sdbc_dynmem->ci_sdbc_simplect.value.ul = simplect_dm; sdbc_dynmem->ci_sdbc_monitor_dynmem.value.ul = process_vars->monitor_dynmem_process; sdbc_dynmem->ci_sdbc_max_dyn_list.value.ul = process_vars->max_dyn_list; sdbc_dynmem->ci_sdbc_cache_aging_ct1.value.ul = process_vars->cache_aging_ct1; sdbc_dynmem->ci_sdbc_cache_aging_ct2.value.ul = process_vars->cache_aging_ct2; sdbc_dynmem->ci_sdbc_cache_aging_ct3.value.ul = process_vars->cache_aging_ct3; sdbc_dynmem->ci_sdbc_cache_aging_sec1.value.ul = process_vars->cache_aging_sec1; sdbc_dynmem->ci_sdbc_cache_aging_sec2.value.ul = process_vars->cache_aging_sec2; sdbc_dynmem->ci_sdbc_cache_aging_sec3.value.ul = process_vars->cache_aging_sec3; sdbc_dynmem->ci_sdbc_cache_aging_pcnt1.value.ul = process_vars->cache_aging_pcnt1; sdbc_dynmem->ci_sdbc_cache_aging_pcnt2.value.ul = process_vars->cache_aging_pcnt2; sdbc_dynmem->ci_sdbc_max_holds_pcnt.value.ul = 
process_vars->max_holds_pcnt; sdbc_dynmem->ci_sdbc_process_directive.value.ul = process_vars->process_directive; sdbc_dynmem->ci_sdbc_alloc_ct.value.ul = process_vars->alloc_ct; sdbc_dynmem->ci_sdbc_dealloc_ct.value.ul = process_vars->dealloc_ct; sdbc_dynmem->ci_sdbc_history.value.ul = process_vars->history; sdbc_dynmem->ci_sdbc_nodatas.value.ul = process_vars->nodatas; sdbc_dynmem->ci_sdbc_candidates.value.ul = process_vars->candidates; sdbc_dynmem->ci_sdbc_deallocs.value.ul = process_vars->deallocs; sdbc_dynmem->ci_sdbc_hosts.value.ul = process_vars->hosts; sdbc_dynmem->ci_sdbc_pests.value.ul = process_vars->pests; sdbc_dynmem->ci_sdbc_metas.value.ul = process_vars->metas; sdbc_dynmem->ci_sdbc_holds.value.ul = process_vars->holds; sdbc_dynmem->ci_sdbc_others.value.ul = process_vars->others; sdbc_dynmem->ci_sdbc_notavail.value.ul = process_vars->notavail; return (0); } #endif
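
/*
 * Illustrative userland sketch (a minimal example, not part of this
 * module and not compiled with it): reading the named kstats exported
 * above with libkstat.  The module string "sdbc" and the per-descriptor
 * kstat name "cdstats0" are assumptions standing in for the values of
 * SDBC_KSTAT_MODULE and SDBC_KSTAT_CDSTATS; compile with -lkstat.
 */
#if 0
#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;
	unsigned int i;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* assumed module/name; instance 0 is cache descriptor 0 */
	ksp = kstat_lookup(kc, "sdbc", 0, "cdstats0");
	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
		kn = (kstat_named_t *)ksp->ks_data;
		for (i = 0; i < ksp->ks_ndata; i++) {
			if (kn[i].data_type == KSTAT_DATA_CHAR)
				(void) printf("%-16s %.16s\n", kn[i].name,
				    kn[i].value.c);
			else
				(void) printf("%-16s %lu\n", kn[i].name,
				    kn[i].value.ul);
		}
	}

	(void) kstat_close(kc);
	return (0);
}
#endif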