/* $NetBSD: rf_disks.c,v 1.93.4.2 2024/05/04 12:04:56 martin Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Greg Oster * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. * * Author: Mark Holland * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /*************************************************************** * rf_disks.c -- code to perform operations on the actual disks ***************************************************************/ #include __KERNEL_RCSID(0, "$NetBSD: rf_disks.c,v 1.93.4.2 2024/05/04 12:04:56 martin Exp $"); #include #include "rf_raid.h" #include "rf_alloclist.h" #include "rf_driver.h" #include "rf_utils.h" #include "rf_general.h" #include "rf_options.h" #include "rf_kintf.h" #include "rf_netbsd.h" #include #include #include #include #include #include #include /* for pathbuf */ #include #include /* for v_rdev */ static int rf_AllocDiskStructures(RF_Raid_t *, RF_Config_t *); static void rf_print_label_status( RF_Raid_t *, int, char *, RF_ComponentLabel_t *); static int rf_check_label_vitals( RF_Raid_t *, int, int, char *, RF_ComponentLabel_t *, int, int ); #define DPRINTF6(a,b,c,d,e,f) if (rf_diskDebug) printf(a,b,c,d,e,f) #define DPRINTF7(a,b,c,d,e,f,g) if (rf_diskDebug) printf(a,b,c,d,e,f,g) /************************************************************************** * * initialize the disks comprising the array * * We want the spare disks to have regular row,col numbers so that we can * easily substitue a spare for a failed disk. But, the driver code assumes * throughout that the array contains numRow by numCol _non-spare_ disks, so * it's not clear how to fit in the spares. This is an unfortunate holdover * from raidSim. The quick and dirty fix is to make row zero bigger than the * rest, and put all the spares in it. This probably needs to get changed * eventually. * **************************************************************************/ int rf_ConfigureDisks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr) { RF_RaidDisk_t *disks; RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL; RF_RowCol_t c; int bs, ret; unsigned i, count, foundone = 0, numFailuresThisRow; int force; force = cfgPtr->force; ret = rf_AllocDiskStructures(raidPtr, cfgPtr); if (ret) goto fail; disks = raidPtr->Disks; numFailuresThisRow = 0; for (c = 0; c < raidPtr->numCol; c++) { ret = rf_ConfigureDisk(raidPtr, &cfgPtr->devnames[0][c][0], &disks[c], c); if (ret) goto fail; if (disks[c].status == rf_ds_optimal) { ret = raidfetch_component_label(raidPtr, c); if (ret) goto fail; /* mark it as failed if the label looks bogus... */ if (!rf_reasonable_label(&raidPtr->raid_cinfo[c].ci_label,0) && !force) { disks[c].status = rf_ds_failed; } } if (disks[c].status != rf_ds_optimal) { numFailuresThisRow++; } else { if (disks[c].numBlocks < min_numblks) min_numblks = disks[c].numBlocks; DPRINTF6("Disk at col %d: dev %s numBlocks %" PRIu64 " blockSize %d (%ld MB)\n", c, disks[c].devname, disks[c].numBlocks, disks[c].blockSize, (long int) disks[c].numBlocks * disks[c].blockSize / 1024 / 1024); } } /* XXX fix for n-fault tolerant */ /* XXX this should probably check to see how many failures we can handle for this configuration! */ if (numFailuresThisRow > 0) raidPtr->status = rf_rs_degraded; /* all disks must be the same size & have the same block size, bs must * be a power of 2 */ bs = 0; foundone = 0; for (c = 0; c < raidPtr->numCol; c++) { if (disks[c].status == rf_ds_optimal) { bs = disks[c].blockSize; foundone = 1; break; } } if (!foundone) { RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n"); ret = EINVAL; goto fail; } for (count = 0, i = 1; i; i <<= 1) if (bs & i) count++; if (count != 1) { RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n", bs); ret = EINVAL; goto fail; } if (rf_CheckLabels( raidPtr, cfgPtr )) { printf("raid%d: There were fatal errors\n", raidPtr->raidid); if (force != 0) { printf("raid%d: Fatal errors being ignored.\n", raidPtr->raidid); } else { ret = EINVAL; goto fail; } } for (c = 0; c < raidPtr->numCol; c++) { if (disks[c].status == rf_ds_optimal) { if (disks[c].blockSize != bs) { RF_ERRORMSG1("Error: block size of disk at c %d different from disk at c 0\n", c); ret = EINVAL; goto fail; } if (disks[c].numBlocks != min_numblks) { RF_ERRORMSG2("WARNING: truncating disk at c %d to %d blocks\n", c, (int) min_numblks); disks[c].numBlocks = min_numblks; } } } raidPtr->sectorsPerDisk = min_numblks; raidPtr->logBytesPerSector = ffs(bs) - 1; raidPtr->bytesPerSector = bs; raidPtr->sectorMask = bs - 1; return (0); fail: rf_UnconfigureVnodes( raidPtr ); return (ret); } /**************************************************************************** * set up the data structures describing the spare disks in the array * recall from the above comment that the spare disk descriptors are stored * in row zero, which is specially expanded to hold them. ****************************************************************************/ int rf_ConfigureSpareDisks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr) { int i, ret; unsigned int bs; RF_RaidDisk_t *disks; int num_spares_done; num_spares_done = 0; /* The space for the spares should have already been allocated by * ConfigureDisks() */ disks = &raidPtr->Disks[raidPtr->numCol]; for (i = 0; i < raidPtr->numSpare; i++) { ret = rf_ConfigureDisk(raidPtr, &cfgPtr->spare_names[i][0], &disks[i], raidPtr->numCol + i); if (ret) goto fail; if (disks[i].status != rf_ds_optimal) { RF_ERRORMSG1("Warning: spare disk %s failed TUR\n", &cfgPtr->spare_names[i][0]); } else { disks[i].status = rf_ds_spare; /* change status to * spare */ DPRINTF6("Spare Disk %d: dev %s numBlocks %" PRIu64 " blockSize %d (%ld MB)\n", i, disks[i].devname, disks[i].numBlocks, disks[i].blockSize, (long int) disks[i].numBlocks * disks[i].blockSize / 1024 / 1024); } num_spares_done++; } /* check sizes and block sizes on spare disks */ bs = 1 << raidPtr->logBytesPerSector; for (i = 0; i < raidPtr->numSpare; i++) { if (disks[i].blockSize != bs) { RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[i].blockSize, disks[i].devname, bs); ret = EINVAL; goto fail; } if (disks[i].numBlocks < raidPtr->sectorsPerDisk) { RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %" PRIu64 " blocks)\n", disks[i].devname, disks[i].blockSize, raidPtr->sectorsPerDisk); ret = EINVAL; goto fail; } else if (disks[i].numBlocks > raidPtr->sectorsPerDisk) { RF_ERRORMSG3("Warning: truncating spare disk %s to %" PRIu64 " blocks (from %" PRIu64 ")\n", disks[i].devname, raidPtr->sectorsPerDisk, disks[i].numBlocks); disks[i].numBlocks = raidPtr->sectorsPerDisk; } } return (0); fail: /* Release the hold on the main components. We've failed to allocate * a spare, and since we're failing, we need to free things.. XXX failing to allocate a spare is *not* that big of a deal... We *can* survive without it, if need be, esp. if we get hot adding working. If we don't fail out here, then we need a way to remove this spare... that should be easier to do here than if we are "live"... */ rf_UnconfigureVnodes( raidPtr ); return (ret); } static int rf_AllocDiskStructures(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr) { int ret; size_t entries = raidPtr->numCol + RF_MAXSPARE; /* We allocate RF_MAXSPARE on the first row so that we have room to do hot-swapping of spares */ raidPtr->Disks = RF_MallocAndAdd( entries * sizeof(*raidPtr->Disks), raidPtr->cleanupList); if (raidPtr->Disks == NULL) { ret = ENOMEM; goto fail; } /* get space for device specific stuff.. */ raidPtr->raid_cinfo = RF_MallocAndAdd( entries * sizeof(*raidPtr->raid_cinfo), raidPtr->cleanupList); if (raidPtr->raid_cinfo == NULL) { ret = ENOMEM; goto fail; } raidPtr->abortRecon = RF_MallocAndAdd( entries * sizeof(int), raidPtr->cleanupList); if (raidPtr->abortRecon == NULL) { ret = ENOMEM; goto fail; } return(0); fail: rf_UnconfigureVnodes( raidPtr ); return(ret); } /* configure a single disk during auto-configuration at boot */ int rf_AutoConfigureDisks(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr, RF_AutoConfig_t *auto_config) { RF_RaidDisk_t *disks; RF_RaidDisk_t *diskPtr; RF_RowCol_t c; RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL; int bs, ret; int numFailuresThisRow; RF_AutoConfig_t *ac; int parity_good; int mod_counter; int mod_counter_found; #if DEBUG printf("Starting autoconfiguration of RAID set...\n"); #endif ret = rf_AllocDiskStructures(raidPtr, cfgPtr); if (ret) goto fail; disks = raidPtr->Disks; /* assume the parity will be fine.. */ parity_good = RF_RAID_CLEAN; /* Check for mod_counters that are too low */ mod_counter_found = 0; mod_counter = 0; ac = auto_config; while(ac!=NULL) { if (mod_counter_found==0) { mod_counter = ac->clabel->mod_counter; mod_counter_found = 1; } else { if (ac->clabel->mod_counter > mod_counter) { mod_counter = ac->clabel->mod_counter; } } ac->flag = 0; /* clear the general purpose flag */ ac = ac->next; } bs = 0; numFailuresThisRow = 0; for (c = 0; c < raidPtr->numCol; c++) { diskPtr = &disks[c]; /* find this row/col in the autoconfig */ #if DEBUG printf("Looking for %d in autoconfig\n",c); #endif ac = auto_config; while(ac!=NULL) { if (ac->clabel==NULL) { /* big-time bad news. */ goto fail; } if ((ac->clabel->column == c) && (ac->clabel->mod_counter == mod_counter)) { /* it's this one... */ /* flag it as 'used', so we don't free it later. */ ac->flag = 1; #if DEBUG printf("Found: %s at %d\n", ac->devname,c); #endif break; } ac=ac->next; } if (ac==NULL) { /* we didn't find an exact match with a correct mod_counter above... can we find one with an incorrect mod_counter to use instead? (this one, if we find it, will be marked as failed once the set configures) */ ac = auto_config; while(ac!=NULL) { if (ac->clabel==NULL) { /* big-time bad news. */ goto fail; } if (ac->clabel->column == c) { /* it's this one... flag it as 'used', so we don't free it later. */ ac->flag = 1; #if DEBUG printf("Found(low mod_counter): %s at %d\n", ac->devname,c); #endif break; } ac=ac->next; } } if (ac!=NULL) { /* Found it. Configure it.. */ diskPtr->blockSize = ac->clabel->blockSize; diskPtr->numBlocks = rf_component_label_numblocks(ac->clabel); /* Note: rf_protectedSectors is already factored into numBlocks here */ raidPtr->raid_cinfo[c].ci_vp = ac->vp; raidPtr->raid_cinfo[c].ci_dev = ac->dev; memcpy(raidget_component_label(raidPtr, c), ac->clabel, sizeof(*ac->clabel)); snprintf(diskPtr->devname, sizeof(diskPtr->devname), "/dev/%s", ac->devname); /* note the fact that this component was autoconfigured. You'll need this info later. Trust me :) */ diskPtr->auto_configured = 1; diskPtr->dev = ac->dev; /* * we allow the user to specify that * only a fraction of the disks should * be used this is just for debug: it * speeds up the parity scan */ diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; /* XXX these will get set multiple times, but since we're autoconfiguring, they'd better be always the same each time! If not, this is the least of your worries */ bs = diskPtr->blockSize; min_numblks = diskPtr->numBlocks; /* this gets done multiple times, but that's fine -- the serial number will be the same for all components, guaranteed */ raidPtr->serial_number = ac->clabel->serial_number; /* check the last time the label was modified */ if (ac->clabel->mod_counter != mod_counter) { /* Even though we've filled in all of the above, we don't trust this component since its modification counter is not in sync with the rest, and we really consider it to be failed. */ disks[c].status = rf_ds_failed; numFailuresThisRow++; } else { if (ac->clabel->clean != RF_RAID_CLEAN) { parity_good = RF_RAID_DIRTY; } } } else { /* Didn't find it at all!! Component must really be dead */ disks[c].status = rf_ds_failed; snprintf(disks[c].devname, sizeof(disks[c].devname), "component%d", c); numFailuresThisRow++; } } /* XXX fix for n-fault tolerant */ /* XXX this should probably check to see how many failures we can handle for this configuration! */ if (numFailuresThisRow > 0) { raidPtr->status = rf_rs_degraded; raidPtr->numFailures = numFailuresThisRow; } /* close the device for the ones that didn't get used */ ac = auto_config; while(ac!=NULL) { if (ac->flag == 0) { vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED); vput(ac->vp); ac->vp = NULL; #if DEBUG printf("Released %s from auto-config set.\n", ac->devname); #endif } ac = ac->next; } raidPtr->mod_counter = mod_counter; /* note the state of the parity, if any */ raidPtr->parity_good = parity_good; raidPtr->sectorsPerDisk = min_numblks; raidPtr->logBytesPerSector = ffs(bs) - 1; raidPtr->bytesPerSector = bs; raidPtr->sectorMask = bs - 1; return (0); fail: rf_UnconfigureVnodes( raidPtr ); return (ret); } /* configure a single disk in the array */ int rf_ConfigureDisk(RF_Raid_t *raidPtr, char *bf, RF_RaidDisk_t *diskPtr, RF_RowCol_t col) { char *p; struct pathbuf *pb; struct vnode *vp; int error; p = rf_find_non_white(bf); if (p[strlen(p) - 1] == '\n') { /* strip off the newline */ p[strlen(p) - 1] = '\0'; } (void) strcpy(diskPtr->devname, p); /* Let's start by claiming the component is fine and well... */ diskPtr->status = rf_ds_optimal; raidPtr->raid_cinfo[col].ci_vp = NULL; raidPtr->raid_cinfo[col].ci_dev = 0; if (!strcmp("absent", diskPtr->devname)) { printf("Ignoring missing component at column %d\n", col); snprintf(diskPtr->devname, sizeof(diskPtr->devname), "component%d", col); diskPtr->status = rf_ds_failed; return (0); } pb = pathbuf_create(diskPtr->devname); if (pb == NULL) { printf("pathbuf_create for device: %s failed!\n", diskPtr->devname); return ENOMEM; } error = vn_bdev_openpath(pb, &vp, curlwp); pathbuf_destroy(pb); if (error) { printf("open device: '%s' failed: %d\n", diskPtr->devname, error); if (error == ENXIO) { /* the component isn't there... must be dead :-( */ diskPtr->status = rf_ds_failed; return 0; } else { return (error); } } if ((error = rf_getdisksize(vp, diskPtr)) != 0) return (error); /* * If this raidPtr's bytesPerSector is zero, fill it in with this * components blockSize. This will give us something to work with * initially, and if it is wrong, we'll get errors later. */ if (raidPtr->bytesPerSector == 0) raidPtr->bytesPerSector = diskPtr->blockSize; if (diskPtr->status == rf_ds_optimal) { raidPtr->raid_cinfo[col].ci_vp = vp; raidPtr->raid_cinfo[col].ci_dev = vp->v_rdev; /* This component was not automatically configured */ diskPtr->auto_configured = 0; diskPtr->dev = vp->v_rdev; /* we allow the user to specify that only a fraction of the * disks should be used this is just for debug: it speeds up * the parity scan */ diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; } return (0); } static void rf_print_label_status(RF_Raid_t *raidPtr, int column, char *dev_name, RF_ComponentLabel_t *ci_label) { printf("raid%d: Component %s being configured at col: %d\n", raidPtr->raidid, dev_name, column ); printf(" Column: %d Num Columns: %d\n", ci_label->column, ci_label->num_columns); printf(" Version: %d Serial Number: %d Mod Counter: %d\n", ci_label->version, ci_label->serial_number, ci_label->mod_counter); printf(" Clean: %s Status: %d\n", ci_label->clean ? "Yes" : "No", ci_label->status ); } static int rf_check_label_vitals(RF_Raid_t *raidPtr, int row, int column, char *dev_name, RF_ComponentLabel_t *ci_label, int serial_number, int mod_counter) { int fatal_error = 0; if (serial_number != ci_label->serial_number) { printf("%s has a different serial number: %d %d\n", dev_name, serial_number, ci_label->serial_number); fatal_error = 1; } if (mod_counter != ci_label->mod_counter) { printf("%s has a different modification count: %d %d\n", dev_name, mod_counter, ci_label->mod_counter); } if (row != ci_label->row) { printf("Row out of alignment for: %s\n", dev_name); fatal_error = 1; } if (column != ci_label->column) { printf("Column out of alignment for: %s\n", dev_name); fatal_error = 1; } if (raidPtr->numCol != ci_label->num_columns) { printf("Number of columns do not match for: %s\n", dev_name); fatal_error = 1; } if (ci_label->clean == 0) { /* it's not clean, but that's not fatal */ printf("%s is not clean!\n", dev_name); } return(fatal_error); } static void rf_handle_hosed(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr, int hosed_column, int again) { printf("Hosed component: %s\n", &cfgPtr->devnames[0][hosed_column][0]); if (cfgPtr->force) return; /* we'll fail this component, as if there are other major errors, we aren't forcing things and we'll abort the config anyways */ if (again && raidPtr->Disks[hosed_column].status == rf_ds_failed) return; raidPtr->Disks[hosed_column].status = rf_ds_failed; raidPtr->numFailures++; raidPtr->status = rf_rs_degraded; } /* rf_CheckLabels() - check all the component labels for consistency. Return an error if there is anything major amiss. */ int rf_CheckLabels(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr) { int c; char *dev_name; RF_ComponentLabel_t *ci_label; int serial_number = 0; int mod_number = 0; int fatal_error = 0; int mod_values[4]; int mod_count[4]; int ser_values[4]; int ser_count[4]; int num_ser; int num_mod; int i; int found; int hosed_column; int too_fatal; int parity_good; hosed_column = -1; too_fatal = 0; /* We're going to try to be a little intelligent here. If one component's label is bogus, and we can identify that it's the *only* one that's gone, we'll mark it as "failed" and allow the configuration to proceed. This will be the *only* case that we'll proceed if there would be (otherwise) fatal errors. Basically we simply keep a count of how many components had what serial number. If all but one agree, we simply mark the disagreeing component as being failed, and allow things to come up "normally". We do this first for serial numbers, and then for "mod_counter". */ num_ser = 0; num_mod = 0; ser_values[0] = ser_values[1] = ser_values[2] = ser_values[3] = 0; ser_count[0] = ser_count[1] = ser_count[2] = ser_count[3] = 0; mod_values[0] = mod_values[1] = mod_values[2] = mod_values[3] = 0; mod_count[0] = mod_count[1] = mod_count[2] = mod_count[3] = 0; for (c = 0; c < raidPtr->numCol; c++) { if (raidPtr->Disks[c].status != rf_ds_optimal) continue; ci_label = raidget_component_label(raidPtr, c); found=0; for(i=0;iserial_number) { ser_count[i]++; found=1; break; } } if (!found) { ser_values[num_ser] = ci_label->serial_number; ser_count[num_ser] = 1; num_ser++; if (num_ser>2) { fatal_error = 1; break; } } found=0; for(i=0;imod_counter) { mod_count[i]++; found=1; break; } } if (!found) { mod_values[num_mod] = ci_label->mod_counter; mod_count[num_mod] = 1; num_mod++; if (num_mod>2) { fatal_error = 1; break; } } } #if DEBUG printf("raid%d: Summary of serial numbers:\n", raidPtr->raidid); for(i=0;iraidid); for(i=0;i ser_count[0]) { serial_number = ser_values[1]; } for (c = 0; c < raidPtr->numCol; c++) { if (raidPtr->Disks[c].status != rf_ds_optimal) continue; ci_label = raidget_component_label(raidPtr, c); if (serial_number != ci_label->serial_number) { hosed_column = c; break; } } if (hosed_column != -1) rf_handle_hosed(raidPtr, cfgPtr, hosed_column, 0); } else { too_fatal = 1; } if (cfgPtr->parityConfig == '0') { /* We've identified two different serial numbers. RAID 0 can't cope with that, so we'll punt */ too_fatal = 1; } } /* record the serial number for later. If we bail later, setting this doesn't matter, otherwise we've got the best guess at the correct serial number */ raidPtr->serial_number = serial_number; mod_number = mod_values[0]; if (num_mod == 2) { if ((mod_count[0] == 1) || (mod_count[1] == 1)) { /* Locate the maverick component */ if (mod_count[1] > mod_count[0]) { mod_number = mod_values[1]; } else if (mod_count[1] < mod_count[0]) { mod_number = mod_values[0]; } else { /* counts of different modification values are the same. Assume greater value is the correct one, all other things considered */ if (mod_values[0] > mod_values[1]) { mod_number = mod_values[0]; } else { mod_number = mod_values[1]; } } for (c = 0; c < raidPtr->numCol; c++) { if (raidPtr->Disks[c].status != rf_ds_optimal) continue; ci_label = raidget_component_label(raidPtr, c); if (mod_number != ci_label->mod_counter) { if (hosed_column == c) { /* same one. Can deal with it. */ } else { hosed_column = c; if (num_ser != 1) { too_fatal = 1; break; } } } } if (hosed_column != -1) rf_handle_hosed(raidPtr, cfgPtr, hosed_column, 1); } else { too_fatal = 1; } if (cfgPtr->parityConfig == '0') { /* We've identified two different mod counters. RAID 0 can't cope with that, so we'll punt */ too_fatal = 1; } } raidPtr->mod_counter = mod_number; if (too_fatal) { /* we've had both a serial number mismatch, and a mod_counter mismatch -- and they involved two different components!! Bail -- make things fail so that the user must force the issue... */ hosed_column = -1; fatal_error = 1; } if (num_ser > 2) { printf("raid%d: Too many different serial numbers!\n", raidPtr->raidid); fatal_error = 1; } if (num_mod > 2) { printf("raid%d: Too many different mod counters!\n", raidPtr->raidid); fatal_error = 1; } for (c = 0; c < raidPtr->numCol; c++) { if (raidPtr->Disks[c].status != rf_ds_optimal) { hosed_column = c; break; } } /* we start by assuming the parity will be good, and flee from that notion at the slightest sign of trouble */ parity_good = RF_RAID_CLEAN; for (c = 0; c < raidPtr->numCol; c++) { dev_name = &cfgPtr->devnames[0][c][0]; ci_label = raidget_component_label(raidPtr, c); if (c == hosed_column) { printf("raid%d: Ignoring %s\n", raidPtr->raidid, dev_name); } else { rf_print_label_status( raidPtr, c, dev_name, ci_label); if (rf_check_label_vitals( raidPtr, 0, c, dev_name, ci_label, serial_number, mod_number )) { fatal_error = 1; } if (ci_label->clean != RF_RAID_CLEAN) { parity_good = RF_RAID_DIRTY; } } } if (fatal_error) { parity_good = RF_RAID_DIRTY; } /* we note the state of the parity */ raidPtr->parity_good = parity_good; return(fatal_error); } int rf_add_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr) { RF_DiskQueue_t *spareQueues; RF_RaidDisk_t *disks; int ret; unsigned int bs; int spare_number; ret=0; if (raidPtr->numSpare >= RF_MAXSPARE) { RF_ERRORMSG1("Too many spares: %d\n", raidPtr->numSpare); return(EINVAL); } rf_lock_mutex2(raidPtr->mutex); while (raidPtr->changing_components == 1) { rf_wait_cond2(raidPtr->changing_components_cv, raidPtr->mutex); } raidPtr->changing_components = 1; rf_unlock_mutex2(raidPtr->mutex); /* the beginning of the spares... */ disks = &raidPtr->Disks[raidPtr->numCol]; spare_number = raidPtr->numSpare; ret = rf_ConfigureDisk(raidPtr, sparePtr->component_name, &disks[spare_number], raidPtr->numCol + spare_number); if (ret) goto fail; if (disks[spare_number].status != rf_ds_optimal) { RF_ERRORMSG1("Warning: spare disk %s failed TUR\n", sparePtr->component_name); rf_close_component(raidPtr, raidPtr->raid_cinfo[raidPtr->numCol+spare_number].ci_vp, 0); ret=EINVAL; goto fail; } else { disks[spare_number].status = rf_ds_spare; DPRINTF6("Spare Disk %d: dev %s numBlocks %" PRIu64 " blockSize %d (%ld MB)\n", spare_number, disks[spare_number].devname, disks[spare_number].numBlocks, disks[spare_number].blockSize, (long int) disks[spare_number].numBlocks * disks[spare_number].blockSize / 1024 / 1024); } /* check sizes and block sizes on the spare disk */ bs = 1 << raidPtr->logBytesPerSector; if (disks[spare_number].blockSize != bs) { RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[spare_number].blockSize, disks[spare_number].devname, bs); rf_close_component(raidPtr, raidPtr->raid_cinfo[raidPtr->numCol+spare_number].ci_vp, 0); ret = EINVAL; goto fail; } if (disks[spare_number].numBlocks < raidPtr->sectorsPerDisk) { RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %" PRIu64 " blocks)\n", disks[spare_number].devname, disks[spare_number].blockSize, raidPtr->sectorsPerDisk); rf_close_component(raidPtr, raidPtr->raid_cinfo[raidPtr->numCol+spare_number].ci_vp, 0); ret = EINVAL; goto fail; } else { if (disks[spare_number].numBlocks > raidPtr->sectorsPerDisk) { RF_ERRORMSG3("Warning: truncating spare disk %s to %" PRIu64 " blocks (from %" PRIu64 ")\n", disks[spare_number].devname, raidPtr->sectorsPerDisk, disks[spare_number].numBlocks); disks[spare_number].numBlocks = raidPtr->sectorsPerDisk; } } /* * We only grow one initialized diskQueue at a time * spare_number can be lower than raidPtr->maxQueue (update) * or they can be equal (initialize new queue) */ RF_ASSERT(spare_number <= raidPtr->maxQueue); spareQueues = &raidPtr->Queues[raidPtr->numCol]; if (spare_number == raidPtr->maxQueue) { ret = rf_ConfigureDiskQueue(raidPtr, &spareQueues[spare_number], raidPtr->numCol + spare_number, raidPtr->qType, raidPtr->sectorsPerDisk, raidPtr->Disks[raidPtr->numCol + spare_number].dev, raidPtr->maxOutstanding, &raidPtr->shutdownList, raidPtr->cleanupList); if (ret) goto fail; rf_lock_mutex2(raidPtr->mutex); raidPtr->maxQueue++; rf_unlock_mutex2(raidPtr->mutex); } else { (void)rf_UpdateDiskQueue(&spareQueues[spare_number], &disks[spare_number]); } fail: rf_lock_mutex2(raidPtr->mutex); if (ret == 0) { raidPtr->numSpare++; } raidPtr->changing_components = 0; rf_signal_cond2(raidPtr->changing_components_cv); rf_unlock_mutex2(raidPtr->mutex); return(ret); } int rf_remove_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr) { int spare_number; int i; RF_RaidDisk_t *disk; struct vnode *vp; int ret = EINVAL; spare_number = sparePtr->column - raidPtr->numCol; if (spare_number < 0 || spare_number > raidPtr->numSpare) return(ret); rf_lock_mutex2(raidPtr->mutex); while (raidPtr->changing_components == 1) { rf_wait_cond2(raidPtr->changing_components_cv, raidPtr->mutex); } raidPtr->changing_components = 1; rf_unlock_mutex2(raidPtr->mutex); rf_SuspendNewRequestsAndWait(raidPtr); disk = &raidPtr->Disks[raidPtr->numCol + spare_number]; if (disk->status != rf_ds_spare && disk->status != rf_ds_failed) { printf("Spare is in use %d\n", disk->status); ret = EBUSY; goto out; } vp = raidPtr->raid_cinfo[raidPtr->numCol + spare_number].ci_vp; raidPtr->raid_cinfo[raidPtr->numCol + spare_number].ci_vp = NULL; raidPtr->raid_cinfo[raidPtr->numCol + spare_number].ci_dev = 0; /* This component was not automatically configured */ disk->auto_configured = 0; disk->dev = 0; disk->numBlocks = 0; disk->status = rf_ds_failed; snprintf(disk->devname, sizeof(disk->devname), "absent_spare%d", spare_number); rf_close_component(raidPtr, vp, 0); rf_lock_mutex2(raidPtr->mutex); /* at this point we know spare_number is to be pushed all the way to the end of the array... */ for (i = raidPtr->numCol + spare_number; i < raidPtr->numCol+raidPtr->numSpare-1; i++) { /* now we work our way up the spare array, swaping the current one for the next one */ rf_swap_components(raidPtr, i, i+1); } raidPtr->numSpare--; rf_unlock_mutex2(raidPtr->mutex); rf_ResumeNewRequests(raidPtr); ret = 0; out: rf_lock_mutex2(raidPtr->mutex); raidPtr->changing_components = 0; rf_signal_cond2(raidPtr->changing_components_cv); rf_unlock_mutex2(raidPtr->mutex); return(ret); } /* * Delete a non hot spare component */ int rf_delete_component(RF_Raid_t *raidPtr, RF_SingleComponent_t *component) { RF_RaidDisk_t *disk; RF_RowCol_t col = component->column; struct vnode *vp; int ret = EINVAL; if (col < 0 || col >= raidPtr->numCol) return(ret); rf_lock_mutex2(raidPtr->mutex); while (raidPtr->changing_components == 1) { rf_wait_cond2(raidPtr->changing_components_cv, raidPtr->mutex); } raidPtr->changing_components = 1; rf_unlock_mutex2(raidPtr->mutex); disk = &raidPtr->Disks[col]; /* 1. This component must be marked as failed or spared */ switch (disk->status) { case rf_ds_failed: case rf_ds_dist_spared: case rf_ds_spared: break; default: ret = EBUSY; goto out; } vp = raidPtr->raid_cinfo[col].ci_vp; raidPtr->raid_cinfo[col].ci_vp = NULL; raidPtr->raid_cinfo[col].ci_dev = 0; /* This component was not automatically configured */ disk->auto_configured = 0; disk->dev = 0; disk->numBlocks = 0; snprintf(disk->devname, sizeof(disk->devname), "component%d", col); rf_close_component(raidPtr, vp, 0); ret = 0; out: rf_lock_mutex2(raidPtr->mutex); raidPtr->changing_components = 0; rf_signal_cond2(raidPtr->changing_components_cv); rf_unlock_mutex2(raidPtr->mutex); return(ret); } int rf_remove_component(RF_Raid_t *raidPtr, RF_SingleComponent_t *component) { RF_RowCol_t col = component->column; if (col < 0 || col >= raidPtr->numCol + raidPtr->numSpare) return(EINVAL); if (col >= raidPtr->numCol) return rf_remove_hot_spare(raidPtr, component); else return rf_delete_component(raidPtr, component); } int rf_incorporate_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *component) { /* Issues here include how to 'move' this in if there is IO taking place (e.g. component queues and such) */ return(EINVAL); /* Not implemented yet. */ } void rf_swap_components(RF_Raid_t *raidPtr, int a, int b) { char tmpdevname[56]; /* 56 is from raidframevar.h */ RF_ComponentLabel_t tmp_ci_label; dev_t tmp_ci_dev, tmp_dev; int tmp_status; struct vnode *tmp_ci_vp; /* This function *MUST* be called with all IO suspended. */ RF_ASSERT(raidPtr->accesses_suspended == 0); /* Swap the component names... */ snprintf(tmpdevname, sizeof(tmpdevname), "%s", raidPtr->Disks[a].devname); snprintf(raidPtr->Disks[a].devname, sizeof(raidPtr->Disks[a].devname), "%s", raidPtr->Disks[b].devname); snprintf(raidPtr->Disks[b].devname, sizeof(raidPtr->Disks[b].devname), "%s", tmpdevname); /* and the vp */ tmp_ci_vp = raidPtr->raid_cinfo[a].ci_vp; raidPtr->raid_cinfo[a].ci_vp = raidPtr->raid_cinfo[b].ci_vp; raidPtr->raid_cinfo[b].ci_vp = tmp_ci_vp; /* and the ci dev */ tmp_ci_dev = raidPtr->raid_cinfo[a].ci_dev; raidPtr->raid_cinfo[a].ci_dev = raidPtr->raid_cinfo[b].ci_dev; raidPtr->raid_cinfo[b].ci_dev = tmp_ci_dev; /* the dev itself */ tmp_dev = raidPtr->Disks[a].dev; raidPtr->Disks[a].dev = raidPtr->Disks[b].dev; raidPtr->Disks[b].dev = tmp_dev; /* the component label */ tmp_ci_label = raidPtr->raid_cinfo[a].ci_label; raidPtr->raid_cinfo[a].ci_label = raidPtr->raid_cinfo[b].ci_label; raidPtr->raid_cinfo[b].ci_label = tmp_ci_label; /* and the status */ tmp_status = raidPtr->Disks[a].status; raidPtr->Disks[a].status = raidPtr->Disks[b].status; raidPtr->Disks[b].status = tmp_status; }