Make it possible to rebuild degraded RAID5 plexes. Note that it is

currently not possible to do this while the volume is mounted.

MFC in:  1 week
This commit is contained in:
Lukas Ertl 2004-09-30 12:57:35 +00:00
parent af9cb375e8
commit c3aadfb9d6
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=135966
6 changed files with 261 additions and 16 deletions

View file

@ -43,6 +43,8 @@ __FBSDID("$FreeBSD$");
int gv_init_plex(struct gv_plex *);
int gv_init_sd(struct gv_sd *);
void gv_init_td(void *);
void gv_rebuild_plex(struct gv_plex *);
void gv_rebuild_td(void *);
void gv_start_plex(struct gv_plex *);
void gv_start_vol(struct gv_volume *);
void gv_sync(struct gv_volume *);
@ -117,8 +119,12 @@ gv_start_plex(struct gv_plex *p)
v = p->vol_sc;
if ((v != NULL) && (v->plexcount > 1))
gv_sync(v);
else if (p->org == GV_PLEX_RAID5)
gv_init_plex(p);
else if (p->org == GV_PLEX_RAID5) {
if (p->state == GV_PLEX_DEGRADED)
gv_rebuild_plex(p);
else
gv_init_plex(p);
}
return;
}
@ -142,7 +148,9 @@ gv_start_vol(struct gv_volume *v)
case GV_PLEX_DOWN:
gv_init_plex(p);
break;
case GV_PLEX_DEGRADED: /* XXX not yet */
case GV_PLEX_DEGRADED:
gv_rebuild_plex(p);
break;
default:
return;
}
@ -191,6 +199,22 @@ gv_sync(struct gv_volume *v)
}
}
/*
 * Start the rebuild of a degraded RAID5 plex: spawn a kernel thread
 * (gv_rebuild_td) that rewrites every stripe from parity.  Does nothing
 * if a sync/rebuild is already in progress or the plex is open.
 */
void
gv_rebuild_plex(struct gv_plex *p)
{
	struct gv_sync_args *sargs;

	/* A rebuild is already running on this plex. */
	if (p->flags & GV_PLEX_SYNCING)
		return;
	/* The plex is in use; rebuilding a mounted volume is not supported. */
	if (gv_is_open(p->geom))
		return;

	sargs = g_malloc(sizeof(*sargs), M_WAITOK | M_ZERO);
	sargs->to = p;
	sargs->syncsize = GV_DFLT_SYNCSIZE;

	kthread_create(gv_rebuild_td, sargs, NULL, 0, 0, "gv_rebuild %s",
	    p->name);
}
int
gv_init_plex(struct gv_plex *p)
{
@ -225,6 +249,94 @@ gv_init_sd(struct gv_sd *s)
return (0);
}
/* This thread is responsible for rebuilding a degraded RAID5 plex. */
void
gv_rebuild_td(void *arg)
{
struct bio *bp;
struct gv_plex *p;
struct g_consumer *cp;
struct gv_sync_args *sync;
u_char *buf;
off_t i;
int error;
buf = NULL;
bp = NULL;
sync = arg;
p = sync->to;
p->synced = 0;
p->flags |= GV_PLEX_SYNCING;
cp = p->consumer;
g_topology_lock();
error = g_access(cp, 1, 1, 0);
if (error) {
g_topology_unlock();
printf("GEOM_VINUM: rebuild of %s failed to access consumer: "
"%d\n", p->name, error);
kthread_exit(error);
}
g_topology_unlock();
buf = g_malloc(sync->syncsize, M_WAITOK);
printf("GEOM_VINUM: rebuild of %s started\n", p->name);
i = 0;
for (i = 0; i < p->size; i += (p->stripesize * (p->sdcount - 1))) {
/*
if (i + sync->syncsize > p->size)
sync->syncsize = p->size - i;
*/
bp = g_new_bio();
if (bp == NULL) {
printf("GEOM_VINUM: rebuild of %s failed creating bio: "
"out of memory\n", p->name);
break;
}
bp->bio_cmd = BIO_WRITE;
bp->bio_done = NULL;
bp->bio_data = buf;
bp->bio_cflags |= GV_BIO_REBUILD;
bp->bio_offset = i;
bp->bio_length = p->stripesize;
/* Schedule it down ... */
g_io_request(bp, cp);
/* ... and wait for the result. */
error = biowait(bp, "gwrite");
if (error) {
printf("GEOM_VINUM: rebuild of %s failed at offset %jd "
"errno: %d\n", p->name, i, error);
break;
}
g_destroy_bio(bp);
bp = NULL;
}
if (bp != NULL)
g_destroy_bio(bp);
if (buf != NULL)
g_free(buf);
g_topology_lock();
g_access(cp, -1, -1, 0);
gv_save_config_all(p->vinumconf);
g_topology_unlock();
p->flags &= ~GV_PLEX_SYNCING;
p->synced = 0;
/* Successful initialization. */
if (!error)
printf("GEOM_VINUM: rebuild of %s finished\n", p->name);
g_free(sync);
kthread_exit(error);
}
void
gv_sync_td(void *arg)
{

View file

@ -365,9 +365,15 @@ gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags)
(intmax_t)s->size, (intmax_t)s->size / MEGABYTE);
sbuf_printf(sb, "\t\tState: %s\n", gv_sdstate(s->state));
if (s->state == GV_SD_INITIALIZING) {
sbuf_printf(sb, "\t\tInitialized: %16jd bytes "
"(%d%%)\n", (intmax_t)s->initialized,
if (s->state == GV_SD_INITIALIZING ||
s->state == GV_SD_REVIVING) {
if (s->state == GV_SD_INITIALIZING)
sbuf_printf(sb, "\t\tInitialized: ");
else
sbuf_printf(sb, "\t\tRevived: ");
sbuf_printf(sb, "%16jd bytes (%d%%)\n",
(intmax_t)s->initialized,
(int)((s->initialized * 100) / s->size));
}
@ -377,20 +383,20 @@ gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags)
gv_roughlength(s->plex_offset, 1));
}
if (s->state == GV_SD_REVIVING) {
/* XXX */
}
sbuf_printf(sb, "\t\tDrive %s (%s) at offset %jd (%s)\n",
s->drive,
s->drive_sc == NULL ? "*missing*" : s->drive_sc->name,
(intmax_t)s->drive_offset,
gv_roughlength(s->drive_offset, 1));
} else {
/* XXX reviving and initializing... */
sbuf_printf(sb, "S %-21s State: ", s->name);
if (s->state == GV_SD_INITIALIZING) {
sbuf_printf(sb, "I %d%%\t",
if (s->state == GV_SD_INITIALIZING ||
s->state == GV_SD_REVIVING) {
if (s->state == GV_SD_INITIALIZING)
sbuf_printf(sb, "I ");
else
sbuf_printf(sb, "R ");
sbuf_printf(sb, "%d%%\t",
(int)((s->initialized * 100) / s->size));
} else {
sbuf_printf(sb, "%s\t", gv_sdstate(s->state));

View file

@ -295,7 +295,9 @@ gv_plex_worker(void *arg)
/* A completed request. */
if (bp->bio_cflags & GV_BIO_DONE) {
g_free(bq);
if (bp->bio_cflags & GV_BIO_SYNCREQ) {
if (bp->bio_cflags & GV_BIO_SYNCREQ ||
bp->bio_cflags & GV_BIO_REBUILD) {
s = bp->bio_to->private;
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
@ -306,8 +308,11 @@ gv_plex_worker(void *arg)
g_topology_unlock();
s->initialized = 0;
}
}
if (bp->bio_cflags & GV_BIO_SYNCREQ)
g_std_done(bp);
} else
else
gv_plex_completed_request(p, bp);
/*
* A sub-request that was hold back because it interfered with
@ -457,7 +462,12 @@ gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
wp->bio = bp;
TAILQ_INIT(&wp->bits);
err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount);
if (bp->bio_cflags & GV_BIO_REBUILD)
err = gv_rebuild_raid5(p, wp, bp, addr,
boff, bcount);
else
err = gv_build_raid5_req(p, wp, bp, addr,
boff, bcount);
/*
* Building the sub-request failed, we probably need to

View file

@ -77,6 +77,117 @@ gv_stripe_active(struct gv_plex *p, struct bio *bp)
return (overlap);
}
/*
 * Build a request group that rebuilds (part of) one stripe of a degraded
 * RAID5 plex: read the matching extent from every healthy subdisk and
 * write the data reconstructed from them to the broken subdisk.  bp is
 * the rebuild bio issued by gv_rebuild_td; boff/bcount describe the
 * plex-address range it covers.  Returns 0 on success or an errno.
 */
int
gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_rebuild_raid5: stripeoff < 0"));

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_rebuild_raid5: stripestart < 0"));

	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_rebuild_raid5: len_left < 0"));

	/* Find the right subdisk (the last one not in state "up"). */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* Broken subdisk not found. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		return (EINVAL);

	case GV_SD_STALE:
		/* Only a rebuild request may revive a stale subdisk. */
		if (!(bp->bio_cflags & GV_BIO_REBUILD))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	real_len = (bcount <= len_left) ? bcount : len_left;
	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		cbp->bio_cflags |= GV_BIO_MALLOC;
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/* Write the parity data. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = broken->consumer;
	cbp->bio_driver1 = wp;
	cbp->bio_cflags |= GV_BIO_REBUILD;
	wp->parity = cbp;

	p->synced = boff;

	return (0);
}
/* Build a request group to perform (part of) a RAID5 request. */
int
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
@ -166,6 +277,9 @@ gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
type = REQ_TYPE_NORMAL;
switch (bp->bio_cmd) {
case BIO_READ:
/*

View file

@ -67,6 +67,8 @@ struct gv_raid5_packet {
int gv_stripe_active(struct gv_plex *, struct bio *);
int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *,
struct bio *, caddr_t, off_t, off_t);
int gv_rebuild_raid5(struct gv_plex *, struct gv_raid5_packet *,
struct bio *, caddr_t, off_t, off_t);
void gv_raid5_worker(void *);
void gv_plex_done(struct bio *);

View file

@ -113,6 +113,7 @@
#define GV_BIO_ONHOLD 0x04
#define GV_BIO_SYNCREQ 0x08
#define GV_BIO_SUCCEED 0x10
#define GV_BIO_REBUILD 0x20
/*
* hostname is 256 bytes long, but we don't need to shlep multiple copies in