1/* 2 md_k.h : kernel internal structure of the Linux MD driver 3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 You should have received a copy of the GNU General Public License 11 (for example /usr/src/linux/COPYING); if not, write to the Free 12 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 13*/ 14 15#ifndef _MD_K_H 16#define _MD_K_H 17 18#define MD_RESERVED 0UL 19#define LINEAR 1UL 20#define RAID0 2UL 21#define RAID1 3UL 22#define RAID5 4UL 23#define TRANSLUCENT 5UL 24#define HSM 6UL 25#define MULTIPATH 7UL 26#define MAX_PERSONALITY 8UL 27 28static inline int pers_to_level (int pers) 29{ 30 switch (pers) { 31 case MULTIPATH: return -4; 32 case HSM: return -3; 33 case TRANSLUCENT: return -2; 34 case LINEAR: return -1; 35 case RAID0: return 0; 36 case RAID1: return 1; 37 case RAID5: return 5; 38 } 39 BUG(); 40 return MD_RESERVED; 41} 42 43static inline int level_to_pers (int level) 44{ 45 switch (level) { 46 case -4: return MULTIPATH; 47 case -3: return HSM; 48 case -2: return TRANSLUCENT; 49 case -1: return LINEAR; 50 case 0: return RAID0; 51 case 1: return RAID1; 52 case 4: 53 case 5: return RAID5; 54 } 55 return MD_RESERVED; 56} 57 58typedef struct mddev_s mddev_t; 59typedef struct mdk_rdev_s mdk_rdev_t; 60 61#if MINORBITS != 8 62#error MD does not handle bigger kdev yet 63#endif 64 65#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */ 66 67/* 68 * Maps a kdev to an mddev/subdev. How 'data' is handled is up to 69 * the personality. (eg. HSM uses this to identify individual LVs) 70 */ 71typedef struct dev_mapping_s { 72 mddev_t *mddev; 73 void *data; 74} dev_mapping_t; 75 76extern dev_mapping_t mddev_map [MAX_MD_DEVS]; 77 78static inline mddev_t * kdev_to_mddev (kdev_t dev) 79{ 80 if (MAJOR(dev) != MD_MAJOR) 81 BUG(); 82 return mddev_map[MINOR(dev)].mddev; 83} 84 85/* 86 * options passed in raidrun: 87 */ 88 89#define MAX_CHUNK_SIZE (4096*1024) 90 91/* 92 * default readahead 93 */ 94#define MD_READAHEAD vm_max_readahead 95 96static inline int disk_faulty(mdp_disk_t * d) 97{ 98 return d->state & (1 << MD_DISK_FAULTY); 99} 100 101static inline int disk_active(mdp_disk_t * d) 102{ 103 return d->state & (1 << MD_DISK_ACTIVE); 104} 105 106static inline int disk_sync(mdp_disk_t * d) 107{ 108 return d->state & (1 << MD_DISK_SYNC); 109} 110 111static inline int disk_spare(mdp_disk_t * d) 112{ 113 return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); 114} 115 116static inline int disk_removed(mdp_disk_t * d) 117{ 118 return d->state & (1 << MD_DISK_REMOVED); 119} 120 121static inline void mark_disk_faulty(mdp_disk_t * d) 122{ 123 d->state |= (1 << MD_DISK_FAULTY); 124} 125 126static inline void mark_disk_active(mdp_disk_t * d) 127{ 128 d->state |= (1 << MD_DISK_ACTIVE); 129} 130 131static inline void mark_disk_sync(mdp_disk_t * d) 132{ 133 d->state |= (1 << MD_DISK_SYNC); 134} 135 136static inline void mark_disk_spare(mdp_disk_t * d) 137{ 138 d->state = 0; 139} 140 141static inline void mark_disk_removed(mdp_disk_t * d) 142{ 143 d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); 144} 145 146static inline void mark_disk_inactive(mdp_disk_t * d) 147{ 148 d->state &= ~(1 << MD_DISK_ACTIVE); 149} 150 151static inline void mark_disk_nonsync(mdp_disk_t * d) 152{ 153 d->state &= ~(1 << MD_DISK_SYNC); 154} 155 156/* 157 * MD's 'extended' device 158 */ 159struct mdk_rdev_s 160{ 161 struct md_list_head same_set; /* RAID devices within the same set */ 162 struct md_list_head all; /* all RAID devices */ 163 struct md_list_head pending; /* undetected RAID devices */ 164 165 kdev_t dev; /* Device number */ 166 kdev_t old_dev; /* "" when it was last imported */ 167 unsigned long size; /* Device size (in blocks) */ 168 mddev_t *mddev; /* RAID array if running */ 169 unsigned long last_events; /* IO event timestamp */ 170 171 struct block_device *bdev; /* block device handle */ 172 173 mdp_super_t *sb; 174 unsigned long sb_offset; 175 176 int alias_device; /* device alias to the same disk */ 177 int faulty; /* if faulty do not issue IO requests */ 178 int desc_nr; /* descriptor index in the superblock */ 179}; 180 181 182/* 183 * disk operations in a working array: 184 */ 185#define DISKOP_SPARE_INACTIVE 0 186#define DISKOP_SPARE_WRITE 1 187#define DISKOP_SPARE_ACTIVE 2 188#define DISKOP_HOT_REMOVE_DISK 3 189#define DISKOP_HOT_ADD_DISK 4 190 191typedef struct mdk_personality_s mdk_personality_t; 192 193struct mddev_s 194{ 195 void *private; 196 mdk_personality_t *pers; 197 int __minor; 198 mdp_super_t *sb; 199 int nb_dev; 200 struct md_list_head disks; 201 int sb_dirty; 202 mdu_param_t param; 203 int ro; 204 unsigned long curr_resync; /* blocks scheduled */ 205 unsigned long resync_mark; /* a recent timestamp */ 206 unsigned long resync_mark_cnt;/* blocks written at resync_mark */ 207 char *name; 208 int recovery_running; 209 struct semaphore reconfig_sem; 210 struct semaphore recovery_sem; 211 struct semaphore resync_sem; 212 atomic_t active; 213 214 atomic_t recovery_active; /* blocks scheduled, but not written */ 215 md_wait_queue_head_t recovery_wait; 216 217 struct md_list_head all_mddevs; 218}; 219 220struct mdk_personality_s 221{ 222 char *name; 223 int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh); 224 int (*run)(mddev_t *mddev); 225 int (*stop)(mddev_t *mddev); 226 int (*status)(char *page, mddev_t *mddev); 227 int (*error_handler)(mddev_t *mddev, kdev_t dev); 228 229/* 230 * Some personalities (RAID-1, RAID-5) can have disks hot-added and 231 * hot-removed. Hot removal is different from failure. (failure marks 232 * a disk inactive, but the disk is still part of the array) The interface 233 * to such operations is the 'pers->diskop()' function, can be NULL. 234 * 235 * the diskop function can change the pointer pointing to the incoming 236 * descriptor, but must do so very carefully. (currently only 237 * SPARE_ACTIVE expects such a change) 238 */ 239 int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); 240 241 int (*stop_resync)(mddev_t *mddev); 242 int (*restart_resync)(mddev_t *mddev); 243 int (*sync_request)(mddev_t *mddev, unsigned long block_nr); 244}; 245 246 247/* 248 * Currently we index md_array directly, based on the minor 249 * number. This will have to change to dynamic allocation 250 * once we start supporting partitioning of md devices. 251 */ 252static inline int mdidx (mddev_t * mddev) 253{ 254 return mddev->__minor; 255} 256 257static inline kdev_t mddev_to_kdev(mddev_t * mddev) 258{ 259 return MKDEV(MD_MAJOR, mdidx(mddev)); 260} 261 262extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev); 263extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); 264extern mdp_disk_t *get_spare(mddev_t *mddev); 265 266/* 267 * iterates through some rdev ringlist. It's safe to remove the 268 * current 'rdev'. Dont touch 'tmp' though. 269 */ 270#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ 271 \ 272 for (tmp = head.next; \ 273 rdev = md_list_entry(tmp, mdk_rdev_t, field), \ 274 tmp = tmp->next, tmp->prev != &head \ 275 ; ) 276/* 277 * iterates through the 'same array disks' ringlist 278 */ 279#define ITERATE_RDEV(mddev,rdev,tmp) \ 280 ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) 281 282/* 283 * Same as above, but assumes that the device has rdev->desc_nr numbered 284 * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. 285 */ 286#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ 287 for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++) 288 289 290/* 291 * Iterates through all 'RAID managed disks' 292 */ 293#define ITERATE_RDEV_ALL(rdev,tmp) \ 294 ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) 295 296/* 297 * Iterates through 'pending RAID disks' 298 */ 299#define ITERATE_RDEV_PENDING(rdev,tmp) \ 300 ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) 301 302/* 303 * iterates through all used mddevs in the system. 304 */ 305#define ITERATE_MDDEV(mddev,tmp) \ 306 \ 307 for (tmp = all_mddevs.next; \ 308 mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ 309 tmp = tmp->next, tmp->prev != &all_mddevs \ 310 ; ) 311 312static inline int lock_mddev (mddev_t * mddev) 313{ 314 return down_interruptible(&mddev->reconfig_sem); 315} 316 317static inline void unlock_mddev (mddev_t * mddev) 318{ 319 up(&mddev->reconfig_sem); 320} 321 322#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \ 323 x = y; y = __tmp; } while (0) 324 325typedef struct mdk_thread_s { 326 void (*run) (void *data); 327 void *data; 328 md_wait_queue_head_t wqueue; 329 unsigned long flags; 330 struct completion *event; 331 struct task_struct *tsk; 332 const char *name; 333} mdk_thread_t; 334 335#define THREAD_WAKEUP 0 336 337#define MAX_DISKNAME_LEN 64 338 339typedef struct dev_name_s { 340 struct md_list_head list; 341 kdev_t dev; 342 char namebuf [MAX_DISKNAME_LEN]; 343 char *name; 344} dev_name_t; 345 346 347#define __wait_event_lock_irq(wq, condition, lock) \ 348do { \ 349 wait_queue_t __wait; \ 350 init_waitqueue_entry(&__wait, current); \ 351 \ 352 add_wait_queue(&wq, &__wait); \ 353 for (;;) { \ 354 set_current_state(TASK_UNINTERRUPTIBLE); \ 355 if (condition) \ 356 break; \ 357 spin_unlock_irq(&lock); \ 358 run_task_queue(&tq_disk); \ 359 schedule(); \ 360 spin_lock_irq(&lock); \ 361 } \ 362 current->state = TASK_RUNNING; \ 363 remove_wait_queue(&wq, &__wait); \ 364} while (0) 365 366#define wait_event_lock_irq(wq, condition, lock) \ 367do { \ 368 if (condition) \ 369 break; \ 370 __wait_event_lock_irq(wq, condition, lock); \ 371} while (0) 372 373 374#define __wait_disk_event(wq, condition) \ 375do { \ 376 wait_queue_t __wait; \ 377 init_waitqueue_entry(&__wait, current); \ 378 \ 379 add_wait_queue(&wq, &__wait); \ 380 for (;;) { \ 381 set_current_state(TASK_UNINTERRUPTIBLE); \ 382 if (condition) \ 383 break; \ 384 run_task_queue(&tq_disk); \ 385 schedule(); \ 386 } \ 387 current->state = TASK_RUNNING; \ 388 remove_wait_queue(&wq, &__wait); \ 389} while (0) 390 391#define wait_disk_event(wq, condition) \ 392do { \ 393 if (condition) \ 394 break; \ 395 __wait_disk_event(wq, condition); \ 396} while (0) 397 398#endif 399 400