Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                 18
-rw-r--r--  drivers/md/bitmap.c                17
-rw-r--r--  drivers/md/dm-crypt.c             506
-rw-r--r--  drivers/md/dm-exception-store.c   176
-rw-r--r--  drivers/md/dm-linear.c             19
-rw-r--r--  drivers/md/dm-mpath.c              83
-rw-r--r--  drivers/md/dm-raid1.c               4
-rw-r--r--  drivers/md/dm-snap.c              351
-rw-r--r--  drivers/md/dm-snap.h               17
-rw-r--r--  drivers/md/dm-table.c             109
-rw-r--r--  drivers/md/dm.c                   113
-rw-r--r--  drivers/md/dm.h                     7
-rw-r--r--  drivers/md/linear.c                15
-rw-r--r--  drivers/md/md.c                   278
-rw-r--r--  drivers/md/multipath.c             27
-rw-r--r--  drivers/md/raid0.c                 17
-rw-r--r--  drivers/md/raid1.c                247
-rw-r--r--  drivers/md/raid10.c               261
-rw-r--r--  drivers/md/raid5.c                 76
19 files changed, 1387 insertions(+), 954 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 6dd31a291d843..c92c1521546df 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -138,16 +138,16 @@ config MD_RAID456
If unsure, say Y.
config MD_RAID5_RESHAPE
- bool "Support adding drives to a raid-5 array (experimental)"
- depends on MD_RAID456 && EXPERIMENTAL
+ bool "Support adding drives to a raid-5 array"
+ depends on MD_RAID456
+ default y
---help---
A RAID-5 set can be expanded by adding extra drives. This
requires "restriping" the array which means (almost) every
block must be written to a different place.
This option allows such restriping to be done while the array
- is online. However it is still EXPERIMENTAL code. It should
- work, but please be sure that you have backups.
+ is online.
You will need mdadm version 2.4.1 or later to use this
feature safely. During the early stage of reshape there is
@@ -164,6 +164,8 @@ config MD_RAID5_RESHAPE
There should be enough spares already present to make the new
array workable.
+ If unsure, say Y.
+
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
@@ -201,6 +203,14 @@ config BLK_DEV_DM
If unsure, say N.
+config DM_DEBUG
+ boolean "Device mapper debugging support"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ Enable this for messages that may help debug device-mapper problems.
+
+ If unsure, say N.
+
config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ecc56765d949e..8e67634e79a0d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -613,6 +613,7 @@ static inline unsigned long file_page_offset(unsigned long chunk)
static inline struct page *filemap_get_page(struct bitmap *bitmap,
unsigned long chunk)
{
+ if (file_page_index(chunk) >= bitmap->file_pages) return NULL;
return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
}
@@ -739,6 +740,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
}
page = filemap_get_page(bitmap, chunk);
+ if (!page) return;
bit = file_page_offset(chunk);
/* set the bit */
@@ -1322,6 +1324,18 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
}
+/* dirty the memory and file bits for bitmap chunks "s" to "e" */
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
+{
+ unsigned long chunk;
+
+ for (chunk = s; chunk <= e; chunk++) {
+ sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap);
+ bitmap_set_memory_bits(bitmap, sec, 1);
+ bitmap_file_set_bit(bitmap, sec);
+ }
+}
+
/*
* flush out any pending updates
*/
@@ -1430,8 +1444,7 @@ int bitmap_create(mddev_t *mddev)
if (err)
goto error;
- bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
- sizeof(bitmap->chunksize));
+ bitmap->chunkshift = ffz(~bitmap->chunksize);
/* now that chunksize and chunkshift are set, we can use these macros */
chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bdbd34993a80c..655d816760e59 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,6 +1,7 @@
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -22,17 +23,19 @@
#include "dm.h"
#define DM_MSG_PREFIX "crypt"
+#define MESG_STR(x) x, sizeof(x)
/*
* per bio private data
*/
struct crypt_io {
struct dm_target *target;
- struct bio *bio;
+ struct bio *base_bio;
struct bio *first_clone;
struct work_struct work;
atomic_t pending;
int error;
+ int post_process;
};
/*
@@ -63,6 +66,7 @@ struct crypt_iv_operations {
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
struct crypt_config {
struct dm_dev *dev;
sector_t start;
@@ -73,6 +77,7 @@ struct crypt_config {
*/
mempool_t *io_pool;
mempool_t *page_pool;
+ struct bio_set *bs;
/*
* crypto related data
@@ -86,11 +91,12 @@ struct crypt_config {
char cipher[CRYPTO_MAX_ALG_NAME];
char chainmode[CRYPTO_MAX_ALG_NAME];
struct crypto_blkcipher *tfm;
+ unsigned long flags;
unsigned int key_size;
u8 key[0];
};
-#define MIN_IOS 256
+#define MIN_IOS 16
#define MIN_POOL_PAGES 32
#define MIN_BIO_PAGES 8
@@ -306,6 +312,14 @@ static int crypt_convert(struct crypt_config *cc,
return r;
}
+ static void dm_crypt_bio_destructor(struct bio *bio)
+ {
+ struct crypt_io *io = bio->bi_private;
+ struct crypt_config *cc = io->target->private;
+
+ bio_free(bio, cc->bs);
+ }
+
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations
@@ -315,34 +329,33 @@ static struct bio *
crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
struct bio *base_bio, unsigned int *bio_vec_idx)
{
- struct bio *bio;
+ struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
unsigned int i;
- /*
- * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and
- * to fail earlier. This is not necessary but increases throughput.
- * FIXME: Is this really intelligent?
- */
- if (base_bio)
- bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC);
- else
- bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs);
- if (!bio)
+ if (base_bio) {
+ clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
+ __bio_clone(clone, base_bio);
+ } else
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+
+ if (!clone)
return NULL;
+ clone->bi_destructor = dm_crypt_bio_destructor;
+
/* if the last bio was not complete, continue where that one ended */
- bio->bi_idx = *bio_vec_idx;
- bio->bi_vcnt = *bio_vec_idx;
- bio->bi_size = 0;
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+ clone->bi_idx = *bio_vec_idx;
+ clone->bi_vcnt = *bio_vec_idx;
+ clone->bi_size = 0;
+ clone->bi_flags &= ~(1 << BIO_SEG_VALID);
- /* bio->bi_idx pages have already been allocated */
- size -= bio->bi_idx * PAGE_SIZE;
+ /* clone->bi_idx pages have already been allocated */
+ size -= clone->bi_idx * PAGE_SIZE;
- for(i = bio->bi_idx; i < nr_iovecs; i++) {
- struct bio_vec *bv = bio_iovec_idx(bio, i);
+ for (i = clone->bi_idx; i < nr_iovecs; i++) {
+ struct bio_vec *bv = bio_iovec_idx(clone, i);
bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
if (!bv->bv_page)
@@ -353,7 +366,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
* return a partially allocated bio, the caller will then try
* to allocate additional bios while submitting this partial bio
*/
- if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1))
+ if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1))
gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
bv->bv_offset = 0;
@@ -362,13 +375,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
else
bv->bv_len = size;
- bio->bi_size += bv->bv_len;
- bio->bi_vcnt++;
+ clone->bi_size += bv->bv_len;
+ clone->bi_vcnt++;
size -= bv->bv_len;
}
- if (!bio->bi_size) {
- bio_put(bio);
+ if (!clone->bi_size) {
+ bio_put(clone);
return NULL;
}
@@ -376,13 +389,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
* Remember the last bio_vec allocated to be able
* to correctly continue after the splitting.
*/
- *bio_vec_idx = bio->bi_vcnt;
+ *bio_vec_idx = clone->bi_vcnt;
- return bio;
+ return clone;
}
static void crypt_free_buffer_pages(struct crypt_config *cc,
- struct bio *bio, unsigned int bytes)
+ struct bio *clone, unsigned int bytes)
{
unsigned int i, start, end;
struct bio_vec *bv;
@@ -396,19 +409,19 @@ static void crypt_free_buffer_pages(struct crypt_config *cc,
* A fix to the bi_idx issue in the kernel is in the works, so
* we will hopefully be able to revert to the cleaner solution soon.
*/
- i = bio->bi_vcnt - 1;
- bv = bio_iovec_idx(bio, i);
- end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size;
+ i = clone->bi_vcnt - 1;
+ bv = bio_iovec_idx(clone, i);
+ end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - clone->bi_size;
start = end - bytes;
start >>= PAGE_SHIFT;
- if (!bio->bi_size)
- end = bio->bi_vcnt;
+ if (!clone->bi_size)
+ end = clone->bi_vcnt;
else
end >>= PAGE_SHIFT;
- for(i = start; i < end; i++) {
- bv = bio_iovec_idx(bio, i);
+ for (i = start; i < end; i++) {
+ bv = bio_iovec_idx(clone, i);
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
bv->bv_page = NULL;
@@ -432,7 +445,7 @@ static void dec_pending(struct crypt_io *io, int error)
if (io->first_clone)
bio_put(io->first_clone);
- bio_endio(io->bio, io->bio->bi_size, io->error);
+ bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
mempool_free(io, cc->io_pool);
}
@@ -441,29 +454,179 @@ static void dec_pending(struct crypt_io *io, int error)
* kcryptd:
*
* Needed because it would be very unwise to do decryption in an
- * interrupt context, so bios returning from read requests get
- * queued here.
+ * interrupt context.
*/
static struct workqueue_struct *_kcryptd_workqueue;
+static void kcryptd_do_work(void *data);
-static void kcryptd_do_work(void *data)
+static void kcryptd_queue_io(struct crypt_io *io)
{
- struct crypt_io *io = (struct crypt_io *) data;
- struct crypt_config *cc = (struct crypt_config *) io->target->private;
+ INIT_WORK(&io->work, kcryptd_do_work, io);
+ queue_work(_kcryptd_workqueue, &io->work);
+}
+
+static int crypt_endio(struct bio *clone, unsigned int done, int error)
+{
+ struct crypt_io *io = clone->bi_private;
+ struct crypt_config *cc = io->target->private;
+ unsigned read_io = bio_data_dir(clone) == READ;
+
+ /*
+ * free the processed pages, even if
+ * it's only a partially completed write
+ */
+ if (!read_io)
+ crypt_free_buffer_pages(cc, clone, done);
+
+ /* keep going - not finished yet */
+ if (unlikely(clone->bi_size))
+ return 1;
+
+ if (!read_io)
+ goto out;
+
+ if (unlikely(!bio_flagged(clone, BIO_UPTODATE))) {
+ error = -EIO;
+ goto out;
+ }
+
+ bio_put(clone);
+ io->post_process = 1;
+ kcryptd_queue_io(io);
+ return 0;
+
+out:
+ bio_put(clone);
+ dec_pending(io, error);
+ return error;
+}
+
+static void clone_init(struct crypt_io *io, struct bio *clone)
+{
+ struct crypt_config *cc = io->target->private;
+
+ clone->bi_private = io;
+ clone->bi_end_io = crypt_endio;
+ clone->bi_bdev = cc->dev->bdev;
+ clone->bi_rw = io->base_bio->bi_rw;
+}
+
+static void process_read(struct crypt_io *io)
+{
+ struct crypt_config *cc = io->target->private;
+ struct bio *base_bio = io->base_bio;
+ struct bio *clone;
+ sector_t sector = base_bio->bi_sector - io->target->begin;
+
+ atomic_inc(&io->pending);
+
+ /*
+ * The block layer might modify the bvec array, so always
+ * copy the required bvecs because we need the original
+ * one in order to decrypt the whole bio data *afterwards*.
+ */
+ clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
+ if (unlikely(!clone)) {
+ dec_pending(io, -ENOMEM);
+ return;
+ }
+
+ clone_init(io, clone);
+ clone->bi_destructor = dm_crypt_bio_destructor;
+ clone->bi_idx = 0;
+ clone->bi_vcnt = bio_segments(base_bio);
+ clone->bi_size = base_bio->bi_size;
+ clone->bi_sector = cc->start + sector;
+ memcpy(clone->bi_io_vec, bio_iovec(base_bio),
+ sizeof(struct bio_vec) * clone->bi_vcnt);
+
+ generic_make_request(clone);
+}
+
+static void process_write(struct crypt_io *io)
+{
+ struct crypt_config *cc = io->target->private;
+ struct bio *base_bio = io->base_bio;
+ struct bio *clone;
struct convert_context ctx;
- int r;
+ unsigned remaining = base_bio->bi_size;
+ sector_t sector = base_bio->bi_sector - io->target->begin;
+ unsigned bvec_idx = 0;
+
+ atomic_inc(&io->pending);
+
+ crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1);
+
+ /*
+ * The allocated buffers can be smaller than the whole bio,
+ * so repeat the whole process until all the data can be handled.
+ */
+ while (remaining) {
+ clone = crypt_alloc_buffer(cc, base_bio->bi_size,
+ io->first_clone, &bvec_idx);
+ if (unlikely(!clone)) {
+ dec_pending(io, -ENOMEM);
+ return;
+ }
+
+ ctx.bio_out = clone;
+
+ if (unlikely(crypt_convert(cc, &ctx) < 0)) {
+ crypt_free_buffer_pages(cc, clone, clone->bi_size);
+ bio_put(clone);
+ dec_pending(io, -EIO);
+ return;
+ }
+
+ clone_init(io, clone);
+ clone->bi_sector = cc->start + sector;
+
+ if (!io->first_clone) {
+ /*
+ * hold a reference to the first clone, because it
+ * holds the bio_vec array and that can't be freed
+ * before all other clones are released
+ */
+ bio_get(clone);
+ io->first_clone = clone;
+ }
+
+ remaining -= clone->bi_size;
+ sector += bio_sectors(clone);
+
+ /* prevent bio_put of first_clone */
+ if (remaining)
+ atomic_inc(&io->pending);
- crypt_convert_init(cc, &ctx, io->bio, io->bio,
- io->bio->bi_sector - io->target->begin, 0);
- r = crypt_convert(cc, &ctx);
+ generic_make_request(clone);
- dec_pending(io, r);
+ /* out of memory -> run queues */
+ if (remaining)
+ blk_congestion_wait(bio_data_dir(clone), HZ/100);
+ }
}
-static void kcryptd_queue_io(struct crypt_io *io)
+static void process_read_endio(struct crypt_io *io)
{
- INIT_WORK(&io->work, kcryptd_do_work, io);
- queue_work(_kcryptd_workqueue, &io->work);
+ struct crypt_config *cc = io->target->private;
+ struct convert_context ctx;
+
+ crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio,
+ io->base_bio->bi_sector - io->target->begin, 0);
+
+ dec_pending(io, crypt_convert(cc, &ctx));
+}
+
+static void kcryptd_do_work(void *data)
+{
+ struct crypt_io *io = data;
+
+ if (io->post_process)
+ process_read_endio(io);
+ else if (bio_data_dir(io->base_bio) == READ)
+ process_read(io);
+ else
+ process_write(io);
}
/*
@@ -477,7 +640,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
buffer[2] = '\0';
- for(i = 0; i < size; i++) {
+ for (i = 0; i < size; i++) {
buffer[0] = *hex++;
buffer[1] = *hex++;
@@ -500,13 +663,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
{
unsigned int i;
- for(i = 0; i < size; i++) {
+ for (i = 0; i < size; i++) {
sprintf(hex, "%02x", *key);
hex += 2;
key++;
}
}
+static int crypt_set_key(struct crypt_config *cc, char *key)
+{
+ unsigned key_size = strlen(key) >> 1;
+
+ if (cc->key_size && cc->key_size != key_size)
+ return -EINVAL;
+
+ cc->key_size = key_size; /* initial settings */
+
+ if ((!key_size && strcmp(key, "-")) ||
+ (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
+ return -EINVAL;
+
+ set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+
+ return 0;
+}
+
+static int crypt_wipe_key(struct crypt_config *cc)
+{
+ clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+ memset(&cc->key, 0, cc->key_size * sizeof(u8));
+ return 0;
+}
+
/*
* Construct an encryption mapping:
* <cipher> <key> <iv_offset> <dev_path> <start>
@@ -539,16 +727,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
key_size = strlen(argv[1]) >> 1;
- cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+ cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
if (cc == NULL) {
ti->error =
"Cannot allocate transparent encryption context";
return -ENOMEM;
}
- cc->key_size = key_size;
- if ((!key_size && strcmp(argv[1], "-") != 0) ||
- (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
+ if (crypt_set_key(cc, argv[1])) {
ti->error = "Error decoding key";
goto bad1;
}
@@ -626,6 +812,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad4;
}
+ cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4);
+ if (!cc->bs) {
+ ti->error = "Cannot allocate crypt bioset";
+ goto bad_bs;
+ }
+
if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
ti->error = "Error setting key";
goto bad5;
@@ -665,6 +857,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return 0;
bad5:
+ bioset_free(cc->bs);
+bad_bs:
mempool_destroy(cc->page_pool);
bad4:
mempool_destroy(cc->io_pool);
@@ -684,6 +878,7 @@ static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = (struct crypt_config *) ti->private;
+ bioset_free(cc->bs);
mempool_destroy(cc->page_pool);
mempool_destroy(cc->io_pool);
@@ -698,147 +893,21 @@ static void crypt_dtr(struct dm_target *ti)
kfree(cc);
}
-static int crypt_endio(struct bio *bio, unsigned int done, int error)
-{
- struct crypt_io *io = (struct crypt_io *) bio->bi_private;
- struct crypt_config *cc = (struct crypt_config *) io->target->private;
-
- if (bio_data_dir(bio) == WRITE) {
- /*
- * free the processed pages, even if
- * it's only a partially completed write
- */
- crypt_free_buffer_pages(cc, bio, done);
- }
-
- if (bio->bi_size)
- return 1;
-
- bio_put(bio);
-
- /*
- * successful reads are decrypted by the worker thread
- */
- if ((bio_data_dir(bio) == READ)
- && bio_flagged(bio, BIO_UPTODATE)) {
- kcryptd_queue_io(io);
- return 0;
- }
-
- dec_pending(io, error);
- return error;
-}
-
-static inline struct bio *
-crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio,
- sector_t sector, unsigned int *bvec_idx,
- struct convert_context *ctx)
-{
- struct bio *clone;
-
- if (bio_data_dir(bio) == WRITE) {
- clone = crypt_alloc_buffer(cc, bio->bi_size,
- io->first_clone, bvec_idx);
- if (clone) {
- ctx->bio_out = clone;
- if (crypt_convert(cc, ctx) < 0) {
- crypt_free_buffer_pages(cc, clone,
- clone->bi_size);
- bio_put(clone);
- return NULL;
- }
- }
- } else {
- /*
- * The block layer might modify the bvec array, so always
- * copy the required bvecs because we need the original
- * one in order to decrypt the whole bio data *afterwards*.
- */
- clone = bio_alloc(GFP_NOIO, bio_segments(bio));
- if (clone) {
- clone->bi_idx = 0;
- clone->bi_vcnt = bio_segments(bio);
- clone->bi_size = bio->bi_size;
- memcpy(clone->bi_io_vec, bio_iovec(bio),
- sizeof(struct bio_vec) * clone->bi_vcnt);
- }
- }
-
- if (!clone)
- return NULL;
-
- clone->bi_private = io;
- clone->bi_end_io = crypt_endio;
- clone->bi_bdev = cc->dev->bdev;
- clone->bi_sector = cc->start + sector;
- clone->bi_rw = bio->bi_rw;
-
- return clone;
-}
-
static int crypt_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
- struct crypt_config *cc = (struct crypt_config *) ti->private;
- struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO);
- struct convert_context ctx;
- struct bio *clone;
- unsigned int remaining = bio->bi_size;
- sector_t sector = bio->bi_sector - ti->begin;
- unsigned int bvec_idx = 0;
+ struct crypt_config *cc = ti->private;
+ struct crypt_io *io;
+ io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti;
- io->bio = bio;
+ io->base_bio = bio;
io->first_clone = NULL;
- io->error = 0;
- atomic_set(&io->pending, 1); /* hold a reference */
-
- if (bio_data_dir(bio) == WRITE)
- crypt_convert_init(cc, &ctx, NULL, bio, sector, 1);
-
- /*
- * The allocated buffers can be smaller than the whole bio,
- * so repeat the whole process until all the data can be handled.
- */
- while (remaining) {
- clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx);
- if (!clone)
- goto cleanup;
-
- if (!io->first_clone) {
- /*
- * hold a reference to the first clone, because it
- * holds the bio_vec array and that can't be freed
- * before all other clones are released
- */
- bio_get(clone);
- io->first_clone = clone;
- }
- atomic_inc(&io->pending);
+ io->error = io->post_process = 0;
+ atomic_set(&io->pending, 0);
+ kcryptd_queue_io(io);
- remaining -= clone->bi_size;
- sector += bio_sectors(clone);
-
- generic_make_request(clone);
-
- /* out of memory -> run queues */
- if (remaining)
- blk_congestion_wait(bio_data_dir(clone), HZ/100);
- }
-
- /* drop reference, clones could have returned before we reach this */
- dec_pending(io, 0);
return 0;
-
-cleanup:
- if (io->first_clone) {
- dec_pending(io, -ENOMEM);
- return 0;
- }
-
- /* if no bio has been dispatched yet, we can directly return the error */
- mempool_free(io, cc->io_pool);
- return -ENOMEM;
}
static int crypt_status(struct dm_target *ti, status_type_t type,
@@ -883,14 +952,71 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
return 0;
}
+static void crypt_postsuspend(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ set_bit(DM_CRYPT_SUSPENDED, &cc->flags);
+}
+
+static int crypt_preresume(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) {
+ DMERR("aborting resume - crypt key is not set.");
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void crypt_resume(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ clear_bit(DM_CRYPT_SUSPENDED, &cc->flags);
+}
+
+/* Message interface
+ * key set <key>
+ * key wipe
+ */
+static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct crypt_config *cc = ti->private;
+
+ if (argc < 2)
+ goto error;
+
+ if (!strnicmp(argv[0], MESG_STR("key"))) {
+ if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
+ DMWARN("not suspended during key manipulation.");
+ return -EINVAL;
+ }
+ if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
+ return crypt_set_key(cc, argv[2]);
+ if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
+ return crypt_wipe_key(cc);
+ }
+
+error:
+ DMWARN("unrecognised message received.");
+ return -EINVAL;
+}
+
static struct target_type crypt_target = {
.name = "crypt",
- .version= {1, 1, 0},
+ .version= {1, 3, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
.map = crypt_map,
.status = crypt_status,
+ .postsuspend = crypt_postsuspend,
+ .preresume = crypt_preresume,
+ .resume = crypt_resume,
+ .message = crypt_message,
};
static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index d12379b5cdb51..99cdffa7fbfe0 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#define DM_MSG_PREFIX "snapshots"
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
/*-----------------------------------------------------------------
* Persistent snapshots, by persistent we mean that the snapshot
@@ -150,6 +151,7 @@ static int alloc_area(struct pstore *ps)
static void free_area(struct pstore *ps)
{
vfree(ps->area);
+ ps->area = NULL;
}
/*
@@ -198,48 +200,79 @@ static int read_header(struct pstore *ps, int *new_snapshot)
int r;
struct disk_header *dh;
chunk_t chunk_size;
+ int chunk_size_supplied = 1;
- r = chunk_io(ps, 0, READ);
+ /*
+ * Use default chunk size (or hardsect_size, if larger) if none supplied
+ */
+ if (!ps->snap->chunk_size) {
+ ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+ bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
+ ps->snap->chunk_mask = ps->snap->chunk_size - 1;
+ ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+ chunk_size_supplied = 0;
+ }
+
+ r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
if (r)
return r;
+ r = alloc_area(ps);
+ if (r)
+ goto bad1;
+
+ r = chunk_io(ps, 0, READ);
+ if (r)
+ goto bad2;
+
dh = (struct disk_header *) ps->area;
if (le32_to_cpu(dh->magic) == 0) {
*new_snapshot = 1;
+ return 0;
+ }
- } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
- *new_snapshot = 0;
- ps->valid = le32_to_cpu(dh->valid);
- ps->version = le32_to_cpu(dh->version);
- chunk_size = le32_to_cpu(dh->chunk_size);
- if (ps->snap->chunk_size != chunk_size) {
- DMWARN("chunk size %llu in device metadata overrides "
- "table chunk size of %llu.",
- (unsigned long long)chunk_size,
- (unsigned long long)ps->snap->chunk_size);
-
- /* We had a bogus chunk_size. Fix stuff up. */
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
- free_area(ps);
-
- ps->snap->chunk_size = chunk_size;
- ps->snap->chunk_mask = chunk_size - 1;
- ps->snap->chunk_shift = ffs(chunk_size) - 1;
-
- r = alloc_area(ps);
- if (r)
- return r;
-
- r = dm_io_get(sectors_to_pages(chunk_size));
- if (r)
- return r;
- }
- } else {
- DMWARN("Invalid/corrupt snapshot");
+ if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
+ DMWARN("Invalid or corrupt snapshot");
r = -ENXIO;
+ goto bad2;
}
+ *new_snapshot = 0;
+ ps->valid = le32_to_cpu(dh->valid);
+ ps->version = le32_to_cpu(dh->version);
+ chunk_size = le32_to_cpu(dh->chunk_size);
+
+ if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+ return 0;
+
+ DMWARN("chunk size %llu in device metadata overrides "
+ "table chunk size of %llu.",
+ (unsigned long long)chunk_size,
+ (unsigned long long)ps->snap->chunk_size);
+
+ /* We had a bogus chunk_size. Fix stuff up. */
+ dm_io_put(sectors_to_pages(ps->snap->chunk_size));
+ free_area(ps);
+
+ ps->snap->chunk_size = chunk_size;
+ ps->snap->chunk_mask = chunk_size - 1;
+ ps->snap->chunk_shift = ffs(chunk_size) - 1;
+
+ r = dm_io_get(sectors_to_pages(chunk_size));
+ if (r)
+ return r;
+
+ r = alloc_area(ps);
+ if (r)
+ goto bad1;
+
+ return 0;
+
+bad2:
+ free_area(ps);
+bad1:
+ dm_io_put(sectors_to_pages(ps->snap->chunk_size));
return r;
}
@@ -263,42 +296,29 @@ static int write_header(struct pstore *ps)
*/
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
- if (index >= ps->exceptions_per_area)
- return NULL;
+ BUG_ON(index >= ps->exceptions_per_area);
return ((struct disk_exception *) ps->area) + index;
}
-static int read_exception(struct pstore *ps,
- uint32_t index, struct disk_exception *result)
+static void read_exception(struct pstore *ps,
+ uint32_t index, struct disk_exception *result)
{
- struct disk_exception *e;
-
- e = get_exception(ps, index);
- if (!e)
- return -EINVAL;
+ struct disk_exception *e = get_exception(ps, index);
/* copy it */
result->old_chunk = le64_to_cpu(e->old_chunk);
result->new_chunk = le64_to_cpu(e->new_chunk);
-
- return 0;
}
-static int write_exception(struct pstore *ps,
- uint32_t index, struct disk_exception *de)
+static void write_exception(struct pstore *ps,
+ uint32_t index, struct disk_exception *de)
{
- struct disk_exception *e;
-
- e = get_exception(ps, index);
- if (!e)
- return -EINVAL;
+ struct disk_exception *e = get_exception(ps, index);
/* copy it */
e->old_chunk = cpu_to_le64(de->old_chunk);
e->new_chunk = cpu_to_le64(de->new_chunk);
-
- return 0;
}
/*
@@ -316,10 +336,7 @@ static int insert_exceptions(struct pstore *ps, int *full)
*full = 1;
for (i = 0; i < ps->exceptions_per_area; i++) {
- r = read_exception(ps, i, &de);
-
- if (r)
- return r;
+ read_exception(ps, i, &de);
/*
* If the new_chunk is pointing at the start of
@@ -519,6 +536,16 @@ static void persistent_commit(struct exception_store *store,
if (r)
ps->valid = 0;
+ /*
+ * Have we completely filled the current area ?
+ */
+ if (ps->current_committed == ps->exceptions_per_area) {
+ ps->current_committed = 0;
+ r = zero_area(ps, ps->current_area + 1);
+ if (r)
+ ps->valid = 0;
+ }
+
for (i = 0; i < ps->callback_count; i++) {
cb = ps->callbacks + i;
cb->callback(cb->context, r == 0 ? 1 : 0);
@@ -526,16 +553,6 @@ static void persistent_commit(struct exception_store *store,
ps->callback_count = 0;
}
-
- /*
- * Have we completely filled the current area ?
- */
- if (ps->current_committed == ps->exceptions_per_area) {
- ps->current_committed = 0;
- r = zero_area(ps, ps->current_area + 1);
- if (r)
- ps->valid = 0;
- }
}
static void persistent_drop(struct exception_store *store)
@@ -547,32 +564,22 @@ static void persistent_drop(struct exception_store *store)
DMWARN("write header failed");
}
-int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
+int dm_create_persistent(struct exception_store *store)
{
- int r;
struct pstore *ps;
- r = dm_io_get(sectors_to_pages(chunk_size));
- if (r)
- return r;
-
/* allocate the pstore */
ps = kmalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps) {
- r = -ENOMEM;
- goto bad;
- }
+ if (!ps)
+ return -ENOMEM;
ps->snap = store->snap;
ps->valid = 1;
ps->version = SNAPSHOT_DISK_VERSION;
+ ps->area = NULL;
ps->next_free = 2; /* skipping the header and first area */
ps->current_committed = 0;
- r = alloc_area(ps);
- if (r)
- goto bad;
-
ps->callback_count = 0;
atomic_set(&ps->pending_count, 0);
ps->callbacks = NULL;
@@ -586,13 +593,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
store->context = ps;
return 0;
-
- bad:
- dm_io_put(sectors_to_pages(chunk_size));
- if (ps && ps->area)
- free_area(ps);
- kfree(ps);
- return r;
}
/*-----------------------------------------------------------------
@@ -642,18 +642,16 @@ static void transient_fraction_full(struct exception_store *store,
*denominator = get_dev_size(store->snap->cow->bdev);
}
-int dm_create_transient(struct exception_store *store,
- struct dm_snapshot *s, int blocksize)
+int dm_create_transient(struct exception_store *store)
{
struct transient_c *tc;
- memset(store, 0, sizeof(*store));
store->destroy = transient_destroy;
store->read_metadata = transient_read_metadata;
store->prepare_exception = transient_prepare;
store->commit_exception = transient_commit;
+ store->drop_snapshot = NULL;
store->fraction_full = transient_fraction_full;
- store->snap = s;
tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
if (!tc)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 47b3c62bbdb88..00234909b3db0 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -98,14 +98,31 @@ static int linear_status(struct dm_target *ti, status_type_t type,
return 0;
}
+static int linear_ioctl(struct dm_target *ti, struct inode *inode,
+ struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct file fake_file = {};
+ struct dentry fake_dentry = {};
+
+ fake_file.f_mode = lc->dev->mode;
+ fake_file.f_dentry = &fake_dentry;
+ fake_dentry.d_inode = bdev->bd_inode;
+
+ return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg);
+}
+
static struct target_type linear_target = {
.name = "linear",
- .version= {1, 0, 1},
+ .version= {1, 0, 2},
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
.map = linear_map,
.status = linear_status,
+ .ioctl = linear_ioctl,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 93f701ea87bc3..d754e0bc6e90c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -114,12 +114,10 @@ static void trigger_event(void *data);
static struct pgpath *alloc_pgpath(void)
{
- struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL);
+ struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
- if (pgpath) {
- memset(pgpath, 0, sizeof(*pgpath));
+ if (pgpath)
pgpath->path.is_active = 1;
- }
return pgpath;
}
@@ -133,12 +131,10 @@ static struct priority_group *alloc_priority_group(void)
{
struct priority_group *pg;
- pg = kmalloc(sizeof(*pg), GFP_KERNEL);
- if (!pg)
- return NULL;
+ pg = kzalloc(sizeof(*pg), GFP_KERNEL);
- memset(pg, 0, sizeof(*pg));
- INIT_LIST_HEAD(&pg->pgpaths);
+ if (pg)
+ INIT_LIST_HEAD(&pg->pgpaths);
return pg;
}
@@ -168,13 +164,12 @@ static void free_priority_group(struct priority_group *pg,
kfree(pg);
}
-static struct multipath *alloc_multipath(void)
+static struct multipath *alloc_multipath(struct dm_target *ti)
{
struct multipath *m;
- m = kmalloc(sizeof(*m), GFP_KERNEL);
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
- memset(m, 0, sizeof(*m));
INIT_LIST_HEAD(&m->priority_groups);
spin_lock_init(&m->lock);
m->queue_io = 1;
@@ -185,6 +180,8 @@ static struct multipath *alloc_multipath(void)
kfree(m);
return NULL;
}
+ m->ti = ti;
+ ti->private = m;
}
return m;
@@ -557,8 +554,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
}
static struct priority_group *parse_priority_group(struct arg_set *as,
- struct multipath *m,
- struct dm_target *ti)
+ struct multipath *m)
{
static struct param _params[] = {
{1, 1024, "invalid number of paths"},
@@ -568,6 +564,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
int r;
unsigned i, nr_selector_args, nr_params;
struct priority_group *pg;
+ struct dm_target *ti = m->ti;
if (as->argc < 2) {
as->argc = 0;
@@ -624,12 +621,12 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
return NULL;
}
-static int parse_hw_handler(struct arg_set *as, struct multipath *m,
- struct dm_target *ti)
+static int parse_hw_handler(struct arg_set *as, struct multipath *m)
{
int r;
struct hw_handler_type *hwht;
unsigned hw_argc;
+ struct dm_target *ti = m->ti;
static struct param _params[] = {
{0, 1024, "invalid number of hardware handler args"},
@@ -661,11 +658,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
return 0;
}
-static int parse_features(struct arg_set *as, struct multipath *m,
- struct dm_target *ti)
+static int parse_features(struct arg_set *as, struct multipath *m)
{
int r;
unsigned argc;
+ struct dm_target *ti = m->ti;
static struct param _params[] = {
{0, 1, "invalid number of feature args"},
@@ -704,19 +701,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
as.argc = argc;
as.argv = argv;
- m = alloc_multipath();
+ m = alloc_multipath(ti);
if (!m) {
ti->error = "can't allocate multipath";
return -EINVAL;
}
- m->ti = ti;
-
- r = parse_features(&as, m, ti);
+ r = parse_features(&as, m);
if (r)
goto bad;
- r = parse_hw_handler(&as, m, ti);
+ r = parse_hw_handler(&as, m);
if (r)
goto bad;
@@ -732,7 +727,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
while (as.argc) {
struct priority_group *pg;
- pg = parse_priority_group(&as, m, ti);
+ pg = parse_priority_group(&as, m);
if (!pg) {
r = -EINVAL;
goto bad;
@@ -752,8 +747,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
goto bad;
}
- ti->private = m;
-
return 0;
bad:
@@ -1266,12 +1259,47 @@ error:
return -EINVAL;
}
+static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
+ struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+ struct block_device *bdev = NULL;
+ unsigned long flags;
+ struct file fake_file = {};
+ struct dentry fake_dentry = {};
+ int r = 0;
+
+ fake_file.f_dentry = &fake_dentry;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (!m->current_pgpath)
+ __choose_pgpath(m);
+
+ if (m->current_pgpath) {
+ bdev = m->current_pgpath->path.dev->bdev;
+ fake_dentry.d_inode = bdev->bd_inode;
+ fake_file.f_mode = m->current_pgpath->path.dev->mode;
+ }
+
+ if (m->queue_io)
+ r = -EAGAIN;
+ else if (!bdev)
+ r = -EIO;
+
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
+ bdev->bd_disk, cmd, arg);
+}
+
/*-----------------------------------------------------------------
* Module setup
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 0, 4},
+ .version = {1, 0, 5},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
@@ -1281,6 +1309,7 @@ static struct target_type multipath_target = {
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
+ .ioctl = multipath_ioctl,
};
static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index c54de989eb005..659224cb7c533 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1213,9 +1213,9 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- DMEMIT("%d ", ms->nr_mirrors);
+ DMEMIT("%d", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++)
- DMEMIT("%s %llu ", ms->mirror[m].dev->name,
+ DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset);
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 1d0fafda0f761..5281e0094072b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -39,6 +39,9 @@
*/
#define SNAPSHOT_PAGES 256
+struct workqueue_struct *ksnapd;
+static void flush_queued_bios(void *data);
+
struct pending_exception {
struct exception e;
@@ -56,7 +59,7 @@ struct pending_exception {
/*
* The primary pending_exception is the one that holds
- * the sibling_count and the list of origin_bios for a
+ * the ref_count and the list of origin_bios for a
* group of pending_exceptions. It is always last to get freed.
* These fields get set up when writing to the origin.
*/
@@ -69,7 +72,7 @@ struct pending_exception {
* the sibling concerned and not pe->primary_pe->snap->lock unless
* they are the same.
*/
- atomic_t sibling_count;
+ atomic_t ref_count;
/* Pointer back to snapshot context */
struct dm_snapshot *snap;
@@ -387,15 +390,46 @@ static inline ulong round_up(ulong n, ulong size)
return (n + size) & ~size;
}
-static void read_snapshot_metadata(struct dm_snapshot *s)
+static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
+ char **error)
{
- if (s->store.read_metadata(&s->store)) {
- down_write(&s->lock);
- s->valid = 0;
- up_write(&s->lock);
+ unsigned long chunk_size;
+ char *value;
+
+ chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
+ if (*chunk_size_arg == '\0' || *value != '\0') {
+ *error = "Invalid chunk size";
+ return -EINVAL;
+ }
+
+ if (!chunk_size) {
+ s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
+ return 0;
+ }
+
+ /*
+ * Chunk size must be multiple of page size. Silently
+ * round up if it's not.
+ */
+ chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
+
+ /* Check chunk_size is a power of 2 */
+ if (chunk_size & (chunk_size - 1)) {
+ *error = "Chunk size is not a power of 2";
+ return -EINVAL;
+ }
- dm_table_event(s->table);
+ /* Validate the chunk size against the device block size */
+ if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
+ *error = "Chunk size is not a multiple of device blocksize";
+ return -EINVAL;
}
+
+ s->chunk_size = chunk_size;
+ s->chunk_mask = chunk_size - 1;
+ s->chunk_shift = ffs(chunk_size) - 1;
+
+ return 0;
}
/*
@@ -404,15 +438,12 @@ static void read_snapshot_metadata(struct dm_snapshot *s)
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dm_snapshot *s;
- unsigned long chunk_size;
int r = -EINVAL;
char persistent;
char *origin_path;
char *cow_path;
- char *value;
- int blocksize;
- if (argc < 4) {
+ if (argc != 4) {
ti->error = "requires exactly 4 arguments";
r = -EINVAL;
goto bad1;
@@ -428,13 +459,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad1;
}
- chunk_size = simple_strtoul(argv[3], &value, 10);
- if (chunk_size == 0 || value == NULL) {
- ti->error = "Invalid chunk size";
- r = -EINVAL;
- goto bad1;
- }
-
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL) {
ti->error = "Cannot allocate snapshot context private "
@@ -457,36 +481,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad2;
}
- /*
- * Chunk size must be multiple of page size. Silently
- * round up if it's not.
- */
- chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
-
- /* Validate the chunk size against the device block size */
- blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
- if (chunk_size % (blocksize >> 9)) {
- ti->error = "Chunk size is not a multiple of device blocksize";
- r = -EINVAL;
- goto bad3;
- }
-
- /* Check chunk_size is a power of 2 */
- if (chunk_size & (chunk_size - 1)) {
- ti->error = "Chunk size is not a power of 2";
- r = -EINVAL;
+ r = set_chunk_size(s, argv[3], &ti->error);
+ if (r)
goto bad3;
- }
- s->chunk_size = chunk_size;
- s->chunk_mask = chunk_size - 1;
s->type = persistent;
- s->chunk_shift = ffs(chunk_size) - 1;
s->valid = 1;
s->active = 0;
s->last_percent = 0;
init_rwsem(&s->lock);
+ spin_lock_init(&s->pe_lock);
s->table = ti->table;
/* Allocate hash table for COW data */
@@ -496,16 +501,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad3;
}
- /*
- * Check the persistent flag - done here because we need the iobuf
- * to check the LV header
- */
s->store.snap = s;
if (persistent == 'P')
- r = dm_create_persistent(&s->store, chunk_size);
+ r = dm_create_persistent(&s->store);
else
- r = dm_create_transient(&s->store, s, blocksize);
+ r = dm_create_transient(&s->store);
if (r) {
ti->error = "Couldn't create exception store";
@@ -520,7 +521,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
/* Metadata must only be loaded into one table at once */
- read_snapshot_metadata(s);
+ r = s->store.read_metadata(&s->store);
+ if (r) {
+ ti->error = "Failed to read snapshot metadata";
+ goto bad6;
+ }
+
+ bio_list_init(&s->queued_bios);
+ INIT_WORK(&s->queued_bios_work, flush_queued_bios, s);
/* Add snapshot to the list of snapshots for this origin */
/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -560,6 +568,8 @@ static void snapshot_dtr(struct dm_target *ti)
{
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+ flush_workqueue(ksnapd);
+
/* Prevent further origin writes from using this snapshot. */
/* After this returns there can be no new kcopyd jobs. */
unregister_snapshot(s);
@@ -593,6 +603,19 @@ static void flush_bios(struct bio *bio)
}
}
+static void flush_queued_bios(void *data)
+{
+ struct dm_snapshot *s = (struct dm_snapshot *) data;
+ struct bio *queued_bios;
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->pe_lock, flags);
+ queued_bios = bio_list_get(&s->queued_bios);
+ spin_unlock_irqrestore(&s->pe_lock, flags);
+
+ flush_bios(queued_bios);
+}
+
/*
* Error a list of buffers.
*/
@@ -608,28 +631,7 @@ static void error_bios(struct bio *bio)
}
}
-static inline void error_snapshot_bios(struct pending_exception *pe)
-{
- error_bios(bio_list_get(&pe->snapshot_bios));
-}
-
-static struct bio *__flush_bios(struct pending_exception *pe)
-{
- /*
- * If this pe is involved in a write to the origin and
- * it is the last sibling to complete then release
- * the bios for the original write to the origin.
- */
-
- if (pe->primary_pe &&
- atomic_dec_and_test(&pe->primary_pe->sibling_count))
- return bio_list_get(&pe->primary_pe->origin_bios);
-
- return NULL;
-}
-
-static void __invalidate_snapshot(struct dm_snapshot *s,
- struct pending_exception *pe, int err)
+static void __invalidate_snapshot(struct dm_snapshot *s, int err)
{
if (!s->valid)
return;
@@ -639,9 +641,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s,
else if (err == -ENOMEM)
DMERR("Invalidating snapshot: Unable to allocate exception.");
- if (pe)
- remove_exception(&pe->e);
-
if (s->store.drop_snapshot)
s->store.drop_snapshot(&s->store);
@@ -650,78 +649,95 @@ static void __invalidate_snapshot(struct dm_snapshot *s,
dm_table_event(s->table);
}
+static void get_pending_exception(struct pending_exception *pe)
+{
+ atomic_inc(&pe->ref_count);
+}
+
+static struct bio *put_pending_exception(struct pending_exception *pe)
+{
+ struct pending_exception *primary_pe;
+ struct bio *origin_bios = NULL;
+
+ primary_pe = pe->primary_pe;
+
+ /*
+ * If this pe is involved in a write to the origin and
+ * it is the last sibling to complete then release
+ * the bios for the original write to the origin.
+ */
+ if (primary_pe &&
+ atomic_dec_and_test(&primary_pe->ref_count))
+ origin_bios = bio_list_get(&primary_pe->origin_bios);
+
+ /*
+ * Free the pe if it's not linked to an origin write or if
+ * it's not itself a primary pe.
+ */
+ if (!primary_pe || primary_pe != pe)
+ free_pending_exception(pe);
+
+ /*
+ * Free the primary pe if nothing references it.
+ */
+ if (primary_pe && !atomic_read(&primary_pe->ref_count))
+ free_pending_exception(primary_pe);
+
+ return origin_bios;
+}
+
static void pending_complete(struct pending_exception *pe, int success)
{
struct exception *e;
- struct pending_exception *primary_pe;
struct dm_snapshot *s = pe->snap;
- struct bio *flush = NULL;
+ struct bio *origin_bios = NULL;
+ struct bio *snapshot_bios = NULL;
+ int error = 0;
if (!success) {
/* Read/write error - snapshot is unusable */
down_write(&s->lock);
- __invalidate_snapshot(s, pe, -EIO);
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
- error_snapshot_bios(pe);
+ __invalidate_snapshot(s, -EIO);
+ error = 1;
goto out;
}
e = alloc_exception();
if (!e) {
down_write(&s->lock);
- __invalidate_snapshot(s, pe, -ENOMEM);
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
- error_snapshot_bios(pe);
+ __invalidate_snapshot(s, -ENOMEM);
+ error = 1;
goto out;
}
*e = pe->e;
- /*
- * Add a proper exception, and remove the
- * in-flight exception from the list.
- */
down_write(&s->lock);
if (!s->valid) {
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
free_exception(e);
-
- error_snapshot_bios(pe);
+ error = 1;
goto out;
}
+ /*
+ * Add a proper exception, and remove the
+ * in-flight exception from the list.
+ */
insert_exception(&s->complete, e);
+
+ out:
remove_exception(&pe->e);
- flush = __flush_bios(pe);
+ snapshot_bios = bio_list_get(&pe->snapshot_bios);
+ origin_bios = put_pending_exception(pe);
up_write(&s->lock);
/* Submit any pending write bios */
- flush_bios(bio_list_get(&pe->snapshot_bios));
-
- out:
- primary_pe = pe->primary_pe;
-
- /*
- * Free the pe if it's not linked to an origin write or if
- * it's not itself a primary pe.
- */
- if (!primary_pe || primary_pe != pe)
- free_pending_exception(pe);
-
- /*
- * Free the primary pe if nothing references it.
- */
- if (primary_pe && !atomic_read(&primary_pe->sibling_count))
- free_pending_exception(primary_pe);
+ if (error)
+ error_bios(snapshot_bios);
+ else
+ flush_bios(snapshot_bios);
- if (flush)
- flush_bios(flush);
+ flush_bios(origin_bios);
}
static void commit_callback(void *context, int success)
@@ -822,7 +838,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
bio_list_init(&pe->origin_bios);
bio_list_init(&pe->snapshot_bios);
pe->primary_pe = NULL;
- atomic_set(&pe->sibling_count, 1);
+ atomic_set(&pe->ref_count, 0);
pe->snap = s;
pe->started = 0;
@@ -831,6 +847,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
return NULL;
}
+ get_pending_exception(pe);
insert_exception(&s->pending, &pe->e);
out:
@@ -850,7 +867,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
{
struct exception *e;
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
- int copy_needed = 0;
int r = 1;
chunk_t chunk;
struct pending_exception *pe = NULL;
@@ -865,32 +881,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
if (unlikely(bio_barrier(bio)))
return -EOPNOTSUPP;
+ /* FIXME: should only take write lock if we need
+ * to copy an exception */
+ down_write(&s->lock);
+
+ if (!s->valid) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /* If the block is already remapped - use that, else remap it */
+ e = lookup_exception(&s->complete, chunk);
+ if (e) {
+ remap_exception(s, e, bio);
+ goto out_unlock;
+ }
+
/*
* Write to snapshot - higher level takes care of RW/RO
* flags so we should only get this if we are
* writeable.
*/
if (bio_rw(bio) == WRITE) {
-
- /* FIXME: should only take write lock if we need
- * to copy an exception */
- down_write(&s->lock);
-
- if (!s->valid) {
- r = -EIO;
- goto out_unlock;
- }
-
- /* If the block is already remapped - use that, else remap it */
- e = lookup_exception(&s->complete, chunk);
- if (e) {
- remap_exception(s, e, bio);
- goto out_unlock;
- }
-
pe = __find_pending_exception(s, bio);
if (!pe) {
- __invalidate_snapshot(s, pe, -ENOMEM);
+ __invalidate_snapshot(s, -ENOMEM);
r = -EIO;
goto out_unlock;
}
@@ -898,45 +913,27 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
remap_exception(s, &pe->e, bio);
bio_list_add(&pe->snapshot_bios, bio);
+ r = 0;
+
if (!pe->started) {
/* this is protected by snap->lock */
pe->started = 1;
- copy_needed = 1;
- }
-
- r = 0;
-
- out_unlock:
- up_write(&s->lock);
-
- if (copy_needed)
+ up_write(&s->lock);
start_copy(pe);
- } else {
+ goto out;
+ }
+ } else
/*
* FIXME: this read path scares me because we
* always use the origin when we have a pending
* exception. However I can't think of a
* situation where this is wrong - ejt.
*/
+ bio->bi_bdev = s->origin->bdev;
- /* Do reads */
- down_read(&s->lock);
-
- if (!s->valid) {
- up_read(&s->lock);
- return -EIO;
- }
-
- /* See if it it has been remapped */
- e = lookup_exception(&s->complete, chunk);
- if (e)
- remap_exception(s, e, bio);
- else
- bio->bi_bdev = s->origin->bdev;
-
- up_read(&s->lock);
- }
-
+ out_unlock:
+ up_write(&s->lock);
+ out:
return r;
}
@@ -1025,7 +1022,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
* is already remapped in this snapshot
* and trigger an exception if not.
*
- * sibling_count is initialised to 1 so pending_complete()
+ * ref_count is initialised to 1 so pending_complete()
* won't destroy the primary_pe while we're inside this loop.
*/
e = lookup_exception(&snap->complete, chunk);
@@ -1034,7 +1031,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
pe = __find_pending_exception(snap, bio);
if (!pe) {
- __invalidate_snapshot(snap, pe, ENOMEM);
+ __invalidate_snapshot(snap, -ENOMEM);
goto next_snapshot;
}
@@ -1056,8 +1053,8 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
}
if (!pe->primary_pe) {
- atomic_inc(&primary_pe->sibling_count);
pe->primary_pe = primary_pe;
+ get_pending_exception(primary_pe);
}
if (!pe->started) {
@@ -1070,20 +1067,20 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
}
if (!primary_pe)
- goto out;
+ return r;
/*
* If this is the first time we're processing this chunk and
- * sibling_count is now 1 it means all the pending exceptions
+ * ref_count is now 1 it means all the pending exceptions
* got completed while we were in the loop above, so it falls to
* us here to remove the primary_pe and submit any origin_bios.
*/
- if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
+ if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
flush_bios(bio_list_get(&primary_pe->origin_bios));
free_pending_exception(primary_pe);
/* If we got here, pe_queue is necessarily empty. */
- goto out;
+ return r;
}
/*
@@ -1092,7 +1089,6 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
start_copy(pe);
- out:
return r;
}
@@ -1205,7 +1201,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
@@ -1216,7 +1212,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -1275,8 +1271,17 @@ static int __init dm_snapshot_init(void)
goto bad5;
}
+ ksnapd = create_singlethread_workqueue("ksnapd");
+ if (!ksnapd) {
+ DMERR("Failed to create ksnapd workqueue.");
+ r = -ENOMEM;
+ goto bad6;
+ }
+
return 0;
+ bad6:
+ mempool_destroy(pending_pool);
bad5:
kmem_cache_destroy(pending_cache);
bad4:
@@ -1294,6 +1299,8 @@ static void __exit dm_snapshot_exit(void)
{
int r;
+ destroy_workqueue(ksnapd);
+
r = dm_unregister_target(&snapshot_target);
if (r)
DMERR("snapshot unregister failed %d", r);
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index fdec1e2dc8718..15fa2ae6cdc29 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -10,7 +10,9 @@
#define DM_SNAPSHOT_H
#include "dm.h"
+#include "dm-bio-list.h"
#include <linux/blkdev.h>
+#include <linux/workqueue.h>
struct exception_table {
uint32_t hash_mask;
@@ -112,10 +114,20 @@ struct dm_snapshot {
struct exception_table pending;
struct exception_table complete;
+ /*
+ * pe_lock protects all pending_exception operations and access
+ * as well as the snapshot_bios list.
+ */
+ spinlock_t pe_lock;
+
/* The on disk metadata handler */
struct exception_store store;
struct kcopyd_client *kcopyd_client;
+
+ /* Queue of snapshot writes for ksnapd to flush */
+ struct bio_list queued_bios;
+ struct work_struct queued_bios_work;
};
/*
@@ -128,10 +140,9 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
* Constructor and destructor for the default persistent
* store.
*/
-int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
+int dm_create_persistent(struct exception_store *store);
-int dm_create_transient(struct exception_store *store,
- struct dm_snapshot *s, int blocksize);
+int dm_create_transient(struct exception_store *store);
/*
* Return the number of sectors in the device.
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 75fe9493e6af4..05befa91807a3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -522,56 +522,61 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
return 0;
}
-
-int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
- sector_t len, int mode, struct dm_dev **result)
+void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
{
- int r = __table_get_device(ti->table, ti, path,
- start, len, mode, result);
- if (!r) {
- request_queue_t *q = bdev_get_queue((*result)->bdev);
- struct io_restrictions *rs = &ti->limits;
-
- /*
- * Combine the device limits low.
- *
- * FIXME: if we move an io_restriction struct
- * into q this would just be a call to
- * combine_restrictions_low()
- */
+ request_queue_t *q = bdev_get_queue(bdev);
+ struct io_restrictions *rs = &ti->limits;
+
+ /*
+ * Combine the device limits low.
+ *
+ * FIXME: if we move an io_restriction struct
+ * into q this would just be a call to
+ * combine_restrictions_low()
+ */
+ rs->max_sectors =
+ min_not_zero(rs->max_sectors, q->max_sectors);
+
+ /* FIXME: Device-Mapper on top of RAID-0 breaks because DM
+ * currently doesn't honor MD's merge_bvec_fn routine.
+ * In this case, we'll force DM to use PAGE_SIZE or
+ * smaller I/O, just to be safe. A better fix is in the
+ * works, but add this for the time being so it will at
+ * least operate correctly.
+ */
+ if (q->merge_bvec_fn)
rs->max_sectors =
- min_not_zero(rs->max_sectors, q->max_sectors);
+ min_not_zero(rs->max_sectors,
+ (unsigned int) (PAGE_SIZE >> 9));
- /* FIXME: Device-Mapper on top of RAID-0 breaks because DM
- * currently doesn't honor MD's merge_bvec_fn routine.
- * In this case, we'll force DM to use PAGE_SIZE or
- * smaller I/O, just to be safe. A better fix is in the
- * works, but add this for the time being so it will at
- * least operate correctly.
- */
- if (q->merge_bvec_fn)
- rs->max_sectors =
- min_not_zero(rs->max_sectors,
- (unsigned int) (PAGE_SIZE >> 9));
+ rs->max_phys_segments =
+ min_not_zero(rs->max_phys_segments,
+ q->max_phys_segments);
- rs->max_phys_segments =
- min_not_zero(rs->max_phys_segments,
- q->max_phys_segments);
+ rs->max_hw_segments =
+ min_not_zero(rs->max_hw_segments, q->max_hw_segments);
- rs->max_hw_segments =
- min_not_zero(rs->max_hw_segments, q->max_hw_segments);
+ rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
- rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
+ rs->max_segment_size =
+ min_not_zero(rs->max_segment_size, q->max_segment_size);
- rs->max_segment_size =
- min_not_zero(rs->max_segment_size, q->max_segment_size);
+ rs->seg_boundary_mask =
+ min_not_zero(rs->seg_boundary_mask,
+ q->seg_boundary_mask);
- rs->seg_boundary_mask =
- min_not_zero(rs->seg_boundary_mask,
- q->seg_boundary_mask);
+ rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
+}
+EXPORT_SYMBOL_GPL(dm_set_device_limits);
- rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
- }
+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
+ sector_t len, int mode, struct dm_dev **result)
+{
+ int r = __table_get_device(ti->table, ti, path,
+ start, len, mode, result);
+
+ if (!r)
+ dm_set_device_limits(ti, (*result)->bdev);
return r;
}
@@ -939,9 +944,20 @@ void dm_table_postsuspend_targets(struct dm_table *t)
return suspend_targets(t, 1);
}
-void dm_table_resume_targets(struct dm_table *t)
+int dm_table_resume_targets(struct dm_table *t)
{
- int i;
+ int i, r = 0;
+
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = t->targets + i;
+
+ if (!ti->type->preresume)
+ continue;
+
+ r = ti->type->preresume(ti);
+ if (r)
+ return r;
+ }
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = t->targets + i;
@@ -949,6 +965,8 @@ void dm_table_resume_targets(struct dm_table *t)
if (ti->type->resume)
ti->type->resume(ti);
}
+
+ return 0;
}
int dm_table_any_congested(struct dm_table *t, int bdi_bits)
@@ -983,6 +1001,11 @@ int dm_table_flush_all(struct dm_table *t)
{
struct list_head *d, *devices = dm_table_get_devices(t);
int ret = 0;
+ unsigned i;
+
+ for (i = 0; i < t->num_targets; i++)
+ if (t->targets[i].type->flush)
+ t->targets[i].type->flush(&t->targets[i]);
for (d = devices->next; d != devices; d = d->next) {
struct dm_dev *dd = list_entry(d, struct dm_dev, list);
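
The dm_set_device_limits() helper factored out above folds each underlying queue's restrictions into the target's io_restrictions by always keeping the most restrictive non-zero value ("combine the device limits low"). A rough user-space sketch of that rule, with a plain function standing in for the kernel's min_not_zero() macro:

#include <stdio.h>

/* 0 means "no limit set", so it must never win the comparison */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
        if (a == 0)
                return b;
        if (b == 0)
                return a;
        return a < b ? a : b;
}

int main(void)
{
        unsigned int max_sectors = 0;   /* target: unrestricted so far */

        max_sectors = min_not_zero(max_sectors, 128);  /* first device: 128 */
        max_sectors = min_not_zero(max_sectors, 256);  /* second device: 256 */
        printf("combined max_sectors = %u\n", max_sectors);   /* 128 */
        return 0;
}
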
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c99bf9f017599..b5764a86c8b56 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>
+#include <linux/smp_lock.h>
#define DM_MSG_PREFIX "core"
@@ -101,6 +102,8 @@ struct mapped_device {
mempool_t *io_pool;
mempool_t *tio_pool;
+ struct bio_set *bs;
+
/*
* Event handling.
*/
@@ -121,16 +124,10 @@ struct mapped_device {
static kmem_cache_t *_io_cache;
static kmem_cache_t *_tio_cache;
-static struct bio_set *dm_set;
-
static int __init local_init(void)
{
int r;
- dm_set = bioset_create(16, 16, 4);
- if (!dm_set)
- return -ENOMEM;
-
/* allocate a slab for the dm_ios */
_io_cache = kmem_cache_create("dm_io",
sizeof(struct dm_io), 0, 0, NULL, NULL);
@@ -164,8 +161,6 @@ static void local_exit(void)
kmem_cache_destroy(_tio_cache);
kmem_cache_destroy(_io_cache);
- bioset_free(dm_set);
-
if (unregister_blkdev(_major, _name) < 0)
DMERR("unregister_blkdev failed");
@@ -288,6 +283,45 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
+static int dm_blk_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct mapped_device *md;
+ struct dm_table *map;
+ struct dm_target *tgt;
+ int r = -ENOTTY;
+
+ /* We don't really need this lock, but we do need 'inode'. */
+ unlock_kernel();
+
+ md = inode->i_bdev->bd_disk->private_data;
+
+ map = dm_get_table(md);
+
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (dm_table_get_num_targets(map) != 1)
+ goto out;
+
+ tgt = dm_table_get_target(map, 0);
+
+ if (dm_suspended(md)) {
+ r = -EAGAIN;
+ goto out;
+ }
+
+ if (tgt->type->ioctl)
+ r = tgt->type->ioctl(tgt, inode, file, cmd, arg);
+
+out:
+ dm_table_put(map);
+
+ lock_kernel();
+ return r;
+}
+
static inline struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -435,7 +469,7 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
{
int r = 0;
struct target_io *tio = bio->bi_private;
- struct dm_io *io = tio->io;
+ struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
if (bio->bi_size)
@@ -454,9 +488,15 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
return 1;
}
- free_tio(io->md, tio);
- dec_pending(io, error);
+ dec_pending(tio->io, error);
+
+ /*
+ * Store md for cleanup instead of tio which is about to get freed.
+ */
+ bio->bi_private = md->bs;
+
bio_put(bio);
+ free_tio(md, tio);
return r;
}
@@ -485,6 +525,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
{
int r;
sector_t sector;
+ struct mapped_device *md;
/*
* Sanity checks.
@@ -514,10 +555,14 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
else if (r < 0) {
/* error the io and bail out */
- struct dm_io *io = tio->io;
- free_tio(tio->io->md, tio);
- dec_pending(io, r);
+ md = tio->io->md;
+ dec_pending(tio->io, r);
+ /*
+ * Store bio_set for cleanup.
+ */
+ clone->bi_private = md->bs;
bio_put(clone);
+ free_tio(md, tio);
}
}
@@ -533,7 +578,9 @@ struct clone_info {
static void dm_bio_destructor(struct bio *bio)
{
- bio_free(bio, dm_set);
+ struct bio_set *bs = bio->bi_private;
+
+ bio_free(bio, bs);
}
/*
@@ -541,12 +588,12 @@ static void dm_bio_destructor(struct bio *bio)
*/
static struct bio *split_bvec(struct bio *bio, sector_t sector,
unsigned short idx, unsigned int offset,
- unsigned int len)
+ unsigned int len, struct bio_set *bs)
{
struct bio *clone;
struct bio_vec *bv = bio->bi_io_vec + idx;
- clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
+ clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
clone->bi_destructor = dm_bio_destructor;
*clone->bi_io_vec = *bv;
@@ -566,11 +613,13 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
*/
static struct bio *clone_bio(struct bio *bio, sector_t sector,
unsigned short idx, unsigned short bv_count,
- unsigned int len)
+ unsigned int len, struct bio_set *bs)
{
struct bio *clone;
- clone = bio_clone(bio, GFP_NOIO);
+ clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
+ __bio_clone(clone, bio);
+ clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
clone->bi_vcnt = idx + bv_count;
@@ -601,7 +650,8 @@ static void __clone_and_map(struct clone_info *ci)
* the remaining io with a single clone.
*/
clone = clone_bio(bio, ci->sector, ci->idx,
- bio->bi_vcnt - ci->idx, ci->sector_count);
+ bio->bi_vcnt - ci->idx, ci->sector_count,
+ ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector_count = 0;
@@ -624,7 +674,8 @@ static void __clone_and_map(struct clone_info *ci)
len += bv_len;
}
- clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
+ clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
+ ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector += len;
@@ -653,7 +704,8 @@ static void __clone_and_map(struct clone_info *ci)
len = min(remaining, max);
clone = split_bvec(bio, ci->sector, ci->idx,
- bv->bv_offset + offset, len);
+ bv->bv_offset + offset, len,
+ ci->md->bs);
__map_bio(ti, clone, tio);
@@ -903,7 +955,7 @@ static struct mapped_device *alloc_dev(int minor)
md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
- goto bad1;
+ goto bad1_free_minor;
md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
@@ -921,6 +973,10 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->tio_pool)
goto bad3;
+ md->bs = bioset_create(16, 16, 4);
+ if (!md->bs)
+ goto bad_no_bioset;
+
md->disk = alloc_disk(1);
if (!md->disk)
goto bad4;
@@ -948,11 +1004,14 @@ static struct mapped_device *alloc_dev(int minor)
return md;
bad4:
+ bioset_free(md->bs);
+ bad_no_bioset:
mempool_destroy(md->tio_pool);
bad3:
mempool_destroy(md->io_pool);
bad2:
blk_cleanup_queue(md->queue);
+ bad1_free_minor:
free_minor(minor);
bad1:
module_put(THIS_MODULE);
@@ -971,6 +1030,7 @@ static void free_dev(struct mapped_device *md)
}
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
+ bioset_free(md->bs);
del_gendisk(md->disk);
free_minor(minor);
@@ -1319,7 +1379,9 @@ int dm_resume(struct mapped_device *md)
if (!map || !dm_table_get_size(map))
goto out;
- dm_table_resume_targets(map);
+ r = dm_table_resume_targets(map);
+ if (r)
+ goto out;
down_write(&md->io_lock);
clear_bit(DMF_BLOCK_IO, &md->flags);
@@ -1337,6 +1399,8 @@ int dm_resume(struct mapped_device *md)
dm_table_unplug_all(map);
+ kobject_uevent(&md->disk->kobj, KOBJ_CHANGE);
+
r = 0;
out:
@@ -1377,6 +1441,7 @@ int dm_suspended(struct mapped_device *md)
static struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
+ .ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
.owner = THIS_MODULE
};
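
alloc_dev() above now also creates a per-device bio_set, and its error path gains another unwind label (bad_no_bioset, plus the renamed bad1_free_minor) so a failure part-way through still releases everything acquired before it, in reverse order. A minimal user-space sketch of that goto-unwind style, with malloc()/free() standing in for the kernel allocators:

#include <stdlib.h>

struct dev_sketch {
        void *queue;
        void *pool;
        void *bs;       /* stands in for the per-device bio_set */
};

static struct dev_sketch *alloc_dev_sketch(void)
{
        struct dev_sketch *d = malloc(sizeof(*d));

        if (!d)
                return NULL;
        d->queue = malloc(64);
        if (!d->queue)
                goto bad_queue;
        d->pool = malloc(64);
        if (!d->pool)
                goto bad_pool;
        d->bs = malloc(64);
        if (!d->bs)
                goto bad_bs;
        return d;

bad_bs:                 /* each label frees what was acquired before it */
        free(d->pool);
bad_pool:
        free(d->queue);
bad_queue:
        free(d);
        return NULL;
}

int main(void)
{
        struct dev_sketch *d = alloc_dev_sketch();

        if (d) {        /* tear down what the sketch allocated */
                free(d->bs);
                free(d->pool);
                free(d->queue);
                free(d);
        }
        return 0;
}
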
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 3c03c0ecab7e4..a48ec5e3c1f47 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -21,6 +21,11 @@
#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
+#ifdef CONFIG_DM_DEBUG
+# define DMDEBUG(f, arg...) printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg)
+#else
+# define DMDEBUG(f, arg...) do {} while (0)
+#endif
#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
0 : scnprintf(result + sz, maxlen - sz, x))
@@ -52,7 +57,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
-void dm_table_resume_targets(struct dm_table *t);
+int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
void dm_table_unplug_all(struct dm_table *t);
int dm_table_flush_all(struct dm_table *t);
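
The DMDEBUG() macro added above expands to a KERN_DEBUG printk() only when CONFIG_DM_DEBUG is enabled and to an empty statement otherwise, so debug call sites compile away in normal builds. A user-space approximation of the same pattern (MY_DEBUG is an invented stand-in for the config option; the variadic form relies on the same GNU extension the kernel macros use):

#include <stdio.h>

#define MY_DEBUG 1

#ifdef MY_DEBUG
# define DBG(fmt, arg...) fprintf(stderr, "debug: " fmt "\n", ## arg)
#else
# define DBG(fmt, arg...) do {} while (0)
#endif

int main(void)
{
        DBG("resuming device %d", 3);   /* emitted only when MY_DEBUG is defined */
        return 0;
}
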
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index b99c19c7eb223..c625ddb8833d8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -111,6 +111,19 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int linear_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ linear_conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ request_queue_t *q = bdev_get_queue(conf->disks[i].rdev->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ return ret;
+}
+
static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
linear_conf_t *conf;
@@ -269,6 +282,8 @@ static int linear_run (mddev_t *mddev)
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
mddev->queue->issue_flush_fn = linear_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = linear_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
return 0;
}
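
linear_congested() above, like the matching callbacks added to multipath, raid0, raid1, raid10 and raid5 later in this diff, lets the VM ask the array whether writeback should back off. For a concatenated or striped array every member must be reachable, so the result is the OR of the members' backing_dev_info answers. A stand-alone model of that aggregation (the bit names are invented stand-ins for the kernel's BDI_*_congested bits):

#include <stdio.h>

#define WRITE_CONGESTED (1 << 0)
#define READ_CONGESTED  (1 << 1)

/* per-member congestion state, as bdi_congested() would report it */
static int member_congested[3] = { 0, WRITE_CONGESTED, 0 };

static int array_congested(int bits)
{
        int i, ret = 0;

        /* stop early once any member reports the requested congestion */
        for (i = 0; i < 3 && !ret; i++)
                ret |= member_congested[i] & bits;
        return ret;
}

int main(void)
{
        printf("write congested: %d\n", !!array_congested(WRITE_CONGESTED)); /* 1 */
        printf("read congested:  %d\n", !!array_congested(READ_CONGESTED));  /* 0 */
        return 0;
}
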
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8dbab2ef38857..cb8281605be8f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -389,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error)
if (bio->bi_size)
return 1;
- if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk("md: super_written gets error=%d, uptodate=%d\n",
+ error, test_bit(BIO_UPTODATE, &bio->bi_flags));
+ WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
md_error(mddev, rdev);
+ }
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
@@ -1587,7 +1591,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
}
}
-void md_update_sb(mddev_t * mddev)
+static void md_update_sb(mddev_t * mddev, int force_change)
{
int err;
struct list_head *tmp;
@@ -1598,7 +1602,18 @@ void md_update_sb(mddev_t * mddev)
repeat:
spin_lock_irq(&mddev->write_lock);
- if (mddev->degraded && mddev->sb_dirty == 3)
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ /* just a clean<-> dirty transition, possibly leave spares alone,
+ * though if events isn't the right even/odd, we will have to do
+ * spares after all
+ */
+ nospares = 1;
+ if (force_change)
+ nospares = 0;
+ if (mddev->degraded)
/* If the array is degraded, then skipping spares is both
* dangerous and fairly pointless.
* Dangerous because a device that was removed from the array
@@ -1608,20 +1623,14 @@ repeat:
* then a recovery will happen and soon that array won't
* be degraded any more and the spare can go back to sleep then.
*/
- mddev->sb_dirty = 1;
+ nospares = 0;
sync_req = mddev->in_sync;
mddev->utime = get_seconds();
- if (mddev->sb_dirty == 3)
- /* just a clean<-> dirty transition, possibly leave spares alone,
- * though if events isn't the right even/odd, we will have to do
- * spares after all
- */
- nospares = 1;
/* If this is just a dirty<->clean transition, and the array is clean
* and 'events' is odd, we can roll back to the previous clean state */
- if (mddev->sb_dirty == 3
+ if (nospares
&& (mddev->in_sync && mddev->recovery_cp == MaxSector)
&& (mddev->events & 1))
mddev->events--;
@@ -1652,7 +1661,6 @@ repeat:
MD_BUG();
mddev->events --;
}
- mddev->sb_dirty = 2;
sync_sbs(mddev, nospares);
/*
@@ -1660,7 +1668,7 @@ repeat:
* nonpersistent superblocks
*/
if (!mddev->persistent) {
- mddev->sb_dirty = 0;
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
return;
@@ -1697,20 +1705,20 @@ repeat:
break;
}
md_super_wait(mddev);
- /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+ /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
spin_lock_irq(&mddev->write_lock);
- if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
+ if (mddev->in_sync != sync_req ||
+ test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
/* have to write it out again */
spin_unlock_irq(&mddev->write_lock);
goto repeat;
}
- mddev->sb_dirty = 0;
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
}
-EXPORT_SYMBOL_GPL(md_update_sb);
/* words written to sysfs files may, or may not, be \n terminated.
* We want to accept with case. For this we use cmd_match.
@@ -1783,7 +1791,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
else {
mddev_t *mddev = rdev->mddev;
kick_rdev_from_array(rdev);
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
md_new_event(mddev);
err = 0;
}
@@ -2426,7 +2434,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
spin_lock_irq(&mddev->write_lock);
if (atomic_read(&mddev->writes_pending) == 0) {
mddev->in_sync = 1;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
spin_unlock_irq(&mddev->write_lock);
} else {
@@ -2438,7 +2446,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
case active:
if (mddev->pers) {
restart_array(mddev);
- mddev->sb_dirty = 0;
+ clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
} else {
@@ -2520,6 +2528,36 @@ static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
static ssize_t
+bitmap_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *end;
+ unsigned long chunk, end_chunk;
+
+ if (!mddev->bitmap)
+ goto out;
+ /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
+ while (*buf) {
+ chunk = end_chunk = simple_strtoul(buf, &end, 0);
+ if (buf == end) break;
+ if (*end == '-') { /* range */
+ buf = end + 1;
+ end_chunk = simple_strtoul(buf, &end, 0);
+ if (buf == end) break;
+ }
+ if (*end && !isspace(*end)) break;
+ bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+ buf = end;
+ while (isspace(*buf)) buf++;
+ }
+ bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+out:
+ return len;
+}
+
+static struct md_sysfs_entry md_bitmap =
+__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
+
+static ssize_t
size_show(mddev_t *mddev, char *page)
{
return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
@@ -2543,7 +2581,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
if (mddev->pers) {
err = update_size(mddev, size);
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
} else {
if (mddev->size == 0 ||
mddev->size > size)
@@ -2839,6 +2877,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_sync_completed.attr,
&md_suspend_lo.attr,
&md_suspend_hi.attr,
+ &md_bitmap.attr,
NULL,
};
static struct attribute_group md_redundancy_group = {
@@ -3111,8 +3150,8 @@ static int do_md_run(mddev_t * mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->flags)
+ md_update_sb(mddev, 0);
set_capacity(disk, mddev->array_size<<1);
@@ -3275,10 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode)
if (mddev->ro)
mddev->ro = 0;
}
- if (!mddev->in_sync || mddev->sb_dirty) {
+ if (!mddev->in_sync || mddev->flags) {
/* mark array as shutdown cleanly */
mddev->in_sync = 1;
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
}
if (mode == 1)
set_disk_ro(disk, 1);
@@ -3374,6 +3413,7 @@ static void autorun_devices(int part)
printk(KERN_INFO "md: autorun ...\n");
while (!list_empty(&pending_raid_disks)) {
+ int unit;
dev_t dev;
LIST_HEAD(candidates);
rdev0 = list_entry(pending_raid_disks.next,
@@ -3393,16 +3433,19 @@ static void autorun_devices(int part)
* mostly sane superblocks. It's time to allocate the
* mddev.
*/
- if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+ if (part) {
+ dev = MKDEV(mdp_major,
+ rdev0->preferred_minor << MdpMinorShift);
+ unit = MINOR(dev) >> MdpMinorShift;
+ } else {
+ dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+ unit = MINOR(dev);
+ }
+ if (rdev0->preferred_minor != unit) {
printk(KERN_INFO "md: unit number in %s is bad: %d\n",
bdevname(rdev0->bdev, b), rdev0->preferred_minor);
break;
}
- if (part)
- dev = MKDEV(mdp_major,
- rdev0->preferred_minor << MdpMinorShift);
- else
- dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
md_probe(dev, NULL, NULL);
mddev = mddev_find(dev);
@@ -3440,67 +3483,6 @@ static void autorun_devices(int part)
printk(KERN_INFO "md: ... autorun DONE.\n");
}
-/*
- * import RAID devices based on one partition
- * if possible, the array gets run as well.
- */
-
-static int autostart_array(dev_t startdev)
-{
- char b[BDEVNAME_SIZE];
- int err = -EINVAL, i;
- mdp_super_t *sb = NULL;
- mdk_rdev_t *start_rdev = NULL, *rdev;
-
- start_rdev = md_import_device(startdev, 0, 0);
- if (IS_ERR(start_rdev))
- return err;
-
-
- /* NOTE: this can only work for 0.90.0 superblocks */
- sb = (mdp_super_t*)page_address(start_rdev->sb_page);
- if (sb->major_version != 0 ||
- sb->minor_version != 90 ) {
- printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
- export_rdev(start_rdev);
- return err;
- }
-
- if (test_bit(Faulty, &start_rdev->flags)) {
- printk(KERN_WARNING
- "md: can not autostart based on faulty %s!\n",
- bdevname(start_rdev->bdev,b));
- export_rdev(start_rdev);
- return err;
- }
- list_add(&start_rdev->same_set, &pending_raid_disks);
-
- for (i = 0; i < MD_SB_DISKS; i++) {
- mdp_disk_t *desc = sb->disks + i;
- dev_t dev = MKDEV(desc->major, desc->minor);
-
- if (!dev)
- continue;
- if (dev == startdev)
- continue;
- if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
- continue;
- rdev = md_import_device(dev, 0, 0);
- if (IS_ERR(rdev))
- continue;
-
- list_add(&rdev->same_set, &pending_raid_disks);
- }
-
- /*
- * possibly return codes
- */
- autorun_devices(0);
- return 0;
-
-}
-
-
static int get_version(void __user * arg)
{
mdu_version_t ver;
@@ -3808,7 +3790,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
goto busy;
kick_rdev_from_array(rdev);
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
md_new_event(mddev);
return 0;
@@ -3885,7 +3867,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
rdev->raid_disk = -1;
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
/*
* Kick recovery, maybe this spare has to be added to the
@@ -4016,7 +3998,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
mddev->max_disks = MD_SB_DISKS;
- mddev->sb_dirty = 1;
+ mddev->flags = 0;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
mddev->bitmap_offset = 0;
@@ -4185,7 +4168,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
mddev->bitmap_offset = 0;
}
}
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
return rv;
}
@@ -4259,27 +4242,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto abort;
}
-
- if (cmd == START_ARRAY) {
- /* START_ARRAY doesn't need to lock the array as autostart_array
- * does the locking, and it could even be a different array
- */
- static int cnt = 3;
- if (cnt > 0 ) {
- printk(KERN_WARNING
- "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
- "This will not be supported beyond July 2006\n",
- current->comm, current->pid);
- cnt--;
- }
- err = autostart_array(new_decode_dev(arg));
- if (err) {
- printk(KERN_WARNING "md: autostart failed!\n");
- goto abort;
- }
- goto done;
- }
-
err = mddev_lock(mddev);
if (err) {
printk(KERN_INFO
@@ -4476,8 +4438,7 @@ static int md_release(struct inode *inode, struct file * file)
{
mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
- if (!mddev)
- BUG();
+ BUG_ON(!mddev);
mddev_put(mddev);
return 0;
@@ -4687,9 +4648,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
(test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
"reshape" :
- (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
- "resync" : "recovery")),
- per_milli/10, per_milli % 10,
+ (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
+ "check" :
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"))),
+ per_milli/10, per_milli % 10,
(unsigned long long) resync,
(unsigned long long) max_blocks);
@@ -5042,12 +5005,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
- mddev->sb_dirty = 3;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
md_wakeup_thread(mddev->thread);
}
spin_unlock_irq(&mddev->write_lock);
}
- wait_event(mddev->sb_wait, mddev->sb_dirty==0);
+ wait_event(mddev->sb_wait, mddev->flags==0);
}
void md_write_end(mddev_t *mddev)
@@ -5078,6 +5041,7 @@ void md_do_sync(mddev_t *mddev)
int skipped = 0;
struct list_head *rtmp;
mdk_rdev_t *rdev;
+ char *desc;
/* just in case thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -5085,6 +5049,18 @@ void md_do_sync(mddev_t *mddev)
if (mddev->ro) /* never try to sync a read-only array */
return;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ desc = "data-check";
+ else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ desc = "requested-resync";
+ else
+ desc = "resync";
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ desc = "reshape";
+ else
+ desc = "recovery";
+
/* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
@@ -5128,10 +5104,10 @@ void md_do_sync(mddev_t *mddev)
prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
if (!kthread_should_stop() &&
mddev2->curr_resync >= mddev->curr_resync) {
- printk(KERN_INFO "md: delaying resync of %s"
- " until %s has finished resync (they"
+ printk(KERN_INFO "md: delaying %s of %s"
+ " until %s has finished (they"
" share one or more physical units)\n",
- mdname(mddev), mdname(mddev2));
+ desc, mdname(mddev), mdname(mddev2));
mddev_put(mddev2);
schedule();
finish_wait(&resync_wait, &wq);
@@ -5167,12 +5143,12 @@ void md_do_sync(mddev_t *mddev)
j = rdev->recovery_offset;
}
- printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
- printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
- " %d KB/sec/disc.\n", speed_min(mddev));
+ printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ speed:"
+ " %d KB/sec/disk.\n", speed_min(mddev));
printk(KERN_INFO "md: using maximum available idle IO bandwidth "
- "(but not more than %d KB/sec) for reconstruction.\n",
- speed_max(mddev));
+ "(but not more than %d KB/sec) for %s.\n",
+ speed_max(mddev), desc);
is_mddev_idle(mddev); /* this also initializes IO event counters */
@@ -5198,8 +5174,8 @@ void md_do_sync(mddev_t *mddev)
if (j>2) {
printk(KERN_INFO
- "md: resuming recovery of %s from checkpoint.\n",
- mdname(mddev));
+ "md: resuming %s of %s from checkpoint.\n",
+ desc, mdname(mddev));
mddev->curr_resync = j;
}
@@ -5282,7 +5258,7 @@ void md_do_sync(mddev_t *mddev)
}
}
}
- printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+ printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
/*
* this also signals 'finished resyncing' to md_stop
*/
@@ -5302,8 +5278,8 @@ void md_do_sync(mddev_t *mddev)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
printk(KERN_INFO
- "md: checkpointing recovery of %s.\n",
- mdname(mddev));
+ "md: checkpointing %s of %s.\n",
+ desc, mdname(mddev));
mddev->recovery_cp = mddev->curr_resync;
}
} else
@@ -5317,7 +5293,6 @@ void md_do_sync(mddev_t *mddev)
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
rdev->recovery_offset = mddev->curr_resync;
- mddev->sb_dirty = 1;
}
}
@@ -5374,7 +5349,7 @@ void md_check_recovery(mddev_t *mddev)
}
if ( ! (
- mddev->sb_dirty ||
+ mddev->flags ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->safemode == 1) ||
@@ -5390,14 +5365,14 @@ void md_check_recovery(mddev_t *mddev)
if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
- mddev->sb_dirty = 3;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
if (mddev->safemode == 1)
mddev->safemode = 0;
spin_unlock_irq(&mddev->write_lock);
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->flags)
+ md_update_sb(mddev, 0);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -5416,7 +5391,7 @@ void md_check_recovery(mddev_t *mddev)
/* activate any spares */
mddev->pers->spare_active(mddev);
}
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
/* if array is no-longer degraded, then any saved_raid_disk
* information must be scrapped
@@ -5556,22 +5531,15 @@ static void md_geninit(void)
static int __init md_init(void)
{
- printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
- " MD_SB_DISKS=%d\n",
- MD_MAJOR_VERSION, MD_MINOR_VERSION,
- MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
- printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
- BITMAP_MINOR);
-
if (register_blkdev(MAJOR_NR, "md"))
return -1;
if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
unregister_blkdev(MAJOR_NR, "md");
return -1;
}
- blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
- md_probe, NULL, NULL);
- blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+ blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
register_reboot_notifier(&md_notifier);
@@ -5630,8 +5598,8 @@ static __exit void md_exit(void)
mddev_t *mddev;
struct list_head *tmp;
- blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
- blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
+ blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+ blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
unregister_blkdev(MAJOR_NR,"md");
unregister_blkdev(mdp_major, "mdp");
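
The largest md.c change above retires mddev->sb_dirty (an int with magic values 1, 2 and 3) in favour of three bits in mddev->flags: MD_CHANGE_DEVS, MD_CHANGE_CLEAN and MD_CHANGE_PENDING. md_update_sb() consumes the first two to decide whether spare information may be skipped, and PENDING marks a superblock write in flight until waiters on sb_wait can proceed. A much-simplified user-space model of that flow, using plain bit masks where the kernel uses atomic set_bit()/test_and_clear_bit():

#include <stdio.h>

enum {
        MD_CHANGE_DEVS    = 1 << 0,   /* device list or layout changed */
        MD_CHANGE_CLEAN   = 1 << 1,   /* only a clean <-> dirty transition */
        MD_CHANGE_PENDING = 1 << 2,   /* superblock write in progress */
};

static unsigned long flags;

static void update_sb(int force_change)
{
        int nospares = 0;

        flags |= MD_CHANGE_PENDING;
        if (flags & MD_CHANGE_DEVS) {
                flags &= ~MD_CHANGE_DEVS;
                force_change = 1;
        }
        if (flags & MD_CHANGE_CLEAN) {
                flags &= ~MD_CHANGE_CLEAN;
                nospares = 1;           /* spares may be left alone */
        }
        if (force_change)
                nospares = 0;

        printf("writing superblocks, nospares=%d\n", nospares);

        /* in the kernel, clearing PENDING wakes anyone in md_write_start()
         * waiting for mddev->flags to reach 0 */
        flags &= ~MD_CHANGE_PENDING;
}

int main(void)
{
        flags |= MD_CHANGE_CLEAN;       /* array just went dirty */
        update_sb(0);                   /* nospares=1 */

        flags |= MD_CHANGE_DEVS;        /* a device failed or was added */
        update_sb(0);                   /* nospares=0 */
        return 0;
}
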
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1cc9de44ce86c..171ff41b52b05 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -228,6 +228,28 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
rcu_read_unlock();
return ret;
}
+static int multipath_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks ; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ request_queue_t *q = bdev_get_queue(rdev->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ /* Just like multipath_map, we just check the
+ * first available device
+ */
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
/*
* Careful, this can execute in IRQ contexts as well!
@@ -253,7 +275,7 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
char b[BDEVNAME_SIZE];
clear_bit(In_sync, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
conf->working_disks--;
printk(KERN_ALERT "multipath: IO failure on %s,"
" disabling IO path. \n Operation continuing"
@@ -470,7 +492,6 @@ static int multipath_run (mddev_t *mddev)
}
conf->raid_disks = mddev->raid_disks;
- mddev->sb_dirty = 1;
conf->mddev = mddev;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
@@ -510,6 +531,8 @@ static int multipath_run (mddev_t *mddev)
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->issue_flush_fn = multipath_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = multipath_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
return 0;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index cb8c6317e4e5f..dfe32149ad3ae 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -60,6 +60,21 @@ static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid0_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+ int i, ret = 0;
+
+ for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ request_queue_t *q = bdev_get_queue(devlist[i]->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ return ret;
+}
+
static int create_strip_zones (mddev_t *mddev)
{
@@ -236,6 +251,8 @@ static int create_strip_zones (mddev_t *mddev)
mddev->queue->unplug_fn = raid0_unplug;
mddev->queue->issue_flush_fn = raid0_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid0_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
printk("raid0: done.\n");
return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3b4d69c056230..dc9d2def0270d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -271,7 +271,7 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
*/
update_head_pos(mirror, r1_bio);
- if (uptodate || conf->working_disks <= 1) {
+ if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) {
/*
* Set R1BIO_Uptodate in our master bio, so that
* we will return a good error code for to the higher
@@ -601,6 +601,32 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid1_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ request_queue_t *q = bdev_get_queue(rdev->bdev);
+
+ /* Note the '|| 1' - when read_balance prefers
+ * non-congested targets, it can be removed
+ */
+ if ((bits & (1<<BDI_write_congested)) || 1)
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ else
+ ret &= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -929,7 +955,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
int i;
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
+ conf->raid_disks - mddev->degraded);
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -953,26 +979,27 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* else mark the drive as failed
*/
if (test_bit(In_sync, &rdev->flags)
- && conf->working_disks == 1)
+ && (conf->raid_disks - mddev->degraded) == 1)
/*
* Don't fail the drive, act as though we were just a
* normal single drive
*/
return;
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->working_disks--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
}
- clear_bit(In_sync, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
static void print_conf(conf_t *conf)
@@ -984,7 +1011,7 @@ static void print_conf(conf_t *conf)
printk("(!conf)\n");
return;
}
- printk(" --- wd:%d rd:%d\n", conf->working_disks,
+ printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
rcu_read_lock();
@@ -1023,10 +1050,11 @@ static int raid1_spare_active(mddev_t *mddev)
mdk_rdev_t *rdev = conf->mirrors[i].rdev;
if (rdev
&& !test_bit(Faulty, &rdev->flags)
- && !test_bit(In_sync, &rdev->flags)) {
- conf->working_disks++;
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- set_bit(In_sync, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
@@ -1368,6 +1396,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 * 3. Performs writes following reads for array synchronising.
*/
+static void fix_read_error(conf_t *conf, int read_disk,
+ sector_t sect, int sectors)
+{
+ mddev_t *mddev = conf->mddev;
+ while(sectors) {
+ int s = sectors;
+ int d = read_disk;
+ int success = 0;
+ int start;
+ mdk_rdev_t *rdev;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ do {
+ /* Note: no rcu protection needed here
+ * as this is synchronous in the raid1d thread
+ * which is the thread that might remove
+ * a device. If raid1d ever becomes multi-threaded....
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags) &&
+ sync_page_io(rdev->bdev,
+ sect + rdev->data_offset,
+ s<<9,
+ conf->tmppage, READ))
+ success = 1;
+ else {
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ }
+ } while (!success && d != read_disk);
+
+ if (!success) {
+ /* Cannot read from anywhere -- bye bye array */
+ md_error(mddev, conf->mirrors[read_disk].rdev);
+ break;
+ }
+ /* write it back and re-read */
+ start = d;
+ while (d != read_disk) {
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev->bdev,
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, WRITE)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ }
+ }
+ d = start;
+ while (d != read_disk) {
+ char b[BDEVNAME_SIZE];
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev->bdev,
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, READ)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ else {
+ atomic_add(s, &rdev->corrected_errors);
+ printk(KERN_INFO
+ "raid1:%s: read error corrected "
+ "(%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)sect +
+ rdev->data_offset,
+ bdevname(rdev->bdev, b));
+ }
+ }
+ }
+ sectors -= s;
+ sect += s;
+ }
+}
+
static void raid1d(mddev_t *mddev)
{
r1bio_t *r1_bio;
@@ -1460,86 +1577,14 @@ static void raid1d(mddev_t *mddev)
* This is all done synchronously while the array is
* frozen
*/
- sector_t sect = r1_bio->sector;
- int sectors = r1_bio->sectors;
- freeze_array(conf);
- if (mddev->ro == 0) while(sectors) {
- int s = sectors;
- int d = r1_bio->read_disk;
- int success = 0;
-
- if (s > (PAGE_SIZE>>9))
- s = PAGE_SIZE >> 9;
-
- do {
- /* Note: no rcu protection needed here
- * as this is synchronous in the raid1d thread
- * which is the thread that might remove
- * a device. If raid1d ever becomes multi-threaded....
- */
- rdev = conf->mirrors[d].rdev;
- if (rdev &&
- test_bit(In_sync, &rdev->flags) &&
- sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ))
- success = 1;
- else {
- d++;
- if (d == conf->raid_disks)
- d = 0;
- }
- } while (!success && d != r1_bio->read_disk);
-
- if (success) {
- /* write it back and re-read */
- int start = d;
- while (d != r1_bio->read_disk) {
- if (d==0)
- d = conf->raid_disks;
- d--;
- rdev = conf->mirrors[d].rdev;
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- }
- }
- d = start;
- while (d != r1_bio->read_disk) {
- if (d==0)
- d = conf->raid_disks;
- d--;
- rdev = conf->mirrors[d].rdev;
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- else {
- atomic_add(s, &rdev->corrected_errors);
- printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
- mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
- }
- }
- }
- } else {
- /* Cannot read from anywhere -- bye bye array */
- md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
- break;
- }
- sectors -= s;
- sect += s;
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector,
+ r1_bio->sectors);
+ unfreeze_array(conf);
}
- unfreeze_array(conf);
-
bio = r1_bio->bios[r1_bio->read_disk];
if ((disk=read_balance(conf, r1_bio)) == -1) {
printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1884,15 +1929,11 @@ static int run(mddev_t *mddev)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->head_position = 0;
- if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
- conf->working_disks++;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
- if (conf->working_disks == 1)
- mddev->recovery_cp = MaxSector;
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -1900,11 +1941,6 @@ static int run(mddev_t *mddev)
bio_list_init(&conf->pending_bio_list);
bio_list_init(&conf->flushing_bio_list);
- if (!conf->working_disks) {
- printk(KERN_ERR "raid1: no operational mirrors for %s\n",
- mdname(mddev));
- goto out_free_conf;
- }
mddev->degraded = 0;
for (i = 0; i < conf->raid_disks; i++) {
@@ -1917,6 +1953,13 @@ static int run(mddev_t *mddev)
mddev->degraded++;
}
}
+ if (mddev->degraded == conf->raid_disks) {
+ printk(KERN_ERR "raid1: no operational mirrors for %s\n",
+ mdname(mddev));
+ goto out_free_conf;
+ }
+ if (conf->raid_disks - mddev->degraded == 1)
+ mddev->recovery_cp = MaxSector;
/*
* find the first working one and use it as a starting point
@@ -1948,6 +1991,8 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->issue_flush_fn = raid1_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid1_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
return 0;
@@ -2035,7 +2080,7 @@ static int raid1_reshape(mddev_t *mddev)
mirror_info_t *newmirrors;
conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks;
-
+ unsigned long flags;
int d, d2;
/* Cannot change chunk_size, layout, or level */
@@ -2094,7 +2139,9 @@ static int raid1_reshape(mddev_t *mddev)
kfree(conf->poolinfo);
conf->poolinfo = newpoolinfo;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (raid_disks - conf->raid_disks);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
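
error() and raid1_spare_active() above now flip In_sync with test_and_clear_bit()/test_and_set_bit() and adjust mddev->degraded under conf->device_lock, so the counter moves exactly once per state change even if two paths notice the same failure. A user-space toy of that idea, with a GCC atomic builtin in place of the kernel bit op and the spinlock omitted:

#include <stdio.h>

static int in_sync = 1;
static int degraded;

static void mark_failed(void)
{
        /* atomically clear the flag and look at its old value; only the
         * caller that actually cleared it bumps the degraded count
         * (kernel equivalent: test_and_clear_bit(In_sync, ...)) */
        if (__sync_fetch_and_and(&in_sync, 0))
                degraded++;
}

int main(void)
{
        mark_failed();
        mark_failed();                  /* racing duplicate: no effect */
        printf("degraded=%d\n", degraded);      /* 1 */
        return 0;
}
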
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 016ddb831c9b3..1250f0eab4afa 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -648,6 +648,26 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid10_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ request_queue_t *q = bdev_get_queue(rdev->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -921,7 +941,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
seq_printf(seq, " %d far-copies", conf->far_copies);
}
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
+ conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf(seq, "%s",
conf->mirrors[i].rdev &&
@@ -941,7 +961,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* else mark the drive as failed
*/
if (test_bit(In_sync, &rdev->flags)
- && conf->working_disks == 1)
+ && conf->raid_disks-mddev->degraded == 1)
/*
* Don't fail the drive, just return an IO error.
* The test should really be more sophisticated than
@@ -950,20 +970,21 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* really dead" tests...
*/
return;
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->working_disks--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
}
- clear_bit(In_sync, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
static void print_conf(conf_t *conf)
@@ -976,7 +997,7 @@ static void print_conf(conf_t *conf)
printk("(!conf)\n");
return;
}
- printk(" --- wd:%d rd:%d\n", conf->working_disks,
+ printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
for (i = 0; i < conf->raid_disks; i++) {
@@ -1034,10 +1055,11 @@ static int raid10_spare_active(mddev_t *mddev)
tmp = conf->mirrors + i;
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
- conf->working_disks++;
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- set_bit(In_sync, &tmp->rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
@@ -1350,9 +1372,119 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
*
* 1. Retries failed read operations on working mirrors.
* 2. Updates the raid superblock when problems encounter.
- * 3. Performs writes following reads for array syncronising.
+ * 3. Performs writes following reads for array synchronising.
*/
+static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
+{
+ int sect = 0; /* Offset from r10_bio->sector */
+ int sectors = r10_bio->sectors;
+ mdk_rdev_t*rdev;
+ while(sectors) {
+ int s = sectors;
+ int sl = r10_bio->read_slot;
+ int success = 0;
+ int start;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ rcu_read_lock();
+ do {
+ int d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ success = sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9,
+ conf->tmppage, READ);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ if (success)
+ break;
+ }
+ sl++;
+ if (sl == conf->copies)
+ sl = 0;
+ } while (!success && sl != r10_bio->read_slot);
+ rcu_read_unlock();
+
+ if (!success) {
+ /* Cannot read from anywhere -- bye bye array */
+ int dn = r10_bio->devs[r10_bio->read_slot].devnum;
+ md_error(mddev, conf->mirrors[dn].rdev);
+ break;
+ }
+
+ start = sl;
+ /* write it back and re-read */
+ rcu_read_lock();
+ while (sl != r10_bio->read_slot) {
+ int d;
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ atomic_add(s, &rdev->corrected_errors);
+ if (sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, WRITE)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ }
+ }
+ sl = start;
+ while (sl != r10_bio->read_slot) {
+ int d;
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ char b[BDEVNAME_SIZE];
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ if (sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, READ) == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ else
+ printk(KERN_INFO
+ "raid10:%s: read error corrected"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)sect+
+ rdev->data_offset,
+ bdevname(rdev->bdev, b));
+
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+
+ sectors -= s;
+ sect += s;
+ }
+}
+
static void raid10d(mddev_t *mddev)
{
r10bio_t *r10_bio;
@@ -1413,105 +1545,12 @@ static void raid10d(mddev_t *mddev)
* This is all done synchronously while the array is
* frozen.
*/
- int sect = 0; /* Offset from r10_bio->sector */
- int sectors = r10_bio->sectors;
- freeze_array(conf);
- if (mddev->ro == 0) while(sectors) {
- int s = sectors;
- int sl = r10_bio->read_slot;
- int success = 0;
-
- if (s > (PAGE_SIZE>>9))
- s = PAGE_SIZE >> 9;
-
- rcu_read_lock();
- do {
- int d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- success = sync_page_io(rdev->bdev,
- r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ);
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- if (success)
- break;
- }
- sl++;
- if (sl == conf->copies)
- sl = 0;
- } while (!success && sl != r10_bio->read_slot);
- rcu_read_unlock();
-
- if (success) {
- int start = sl;
- /* write it back and re-read */
- rcu_read_lock();
- while (sl != r10_bio->read_slot) {
- int d;
- if (sl==0)
- sl = conf->copies;
- sl--;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- atomic_add(s, &rdev->corrected_errors);
- if (sync_page_io(rdev->bdev,
- r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- sl = start;
- while (sl != r10_bio->read_slot) {
- int d;
- if (sl==0)
- sl = conf->copies;
- sl--;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- if (sync_page_io(rdev->bdev,
- r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- else
- printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
- mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- } else {
- /* Cannot read from anywhere -- bye bye array */
- md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
- break;
- }
- sectors -= s;
- sect += s;
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, mddev, r10_bio);
+ unfreeze_array(conf);
}
- unfreeze_array(conf);
-
bio = r10_bio->devs[r10_bio->read_slot].bio;
r10_bio->devs[r10_bio->read_slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
@@ -2018,8 +2057,6 @@ static int run(mddev_t *mddev)
mddev->queue->max_sectors = (PAGE_SIZE>>9);
disk->head_position = 0;
- if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
- conf->working_disks++;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
@@ -2077,6 +2114,8 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->issue_flush_fn = raid10_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid10_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
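
fix_read_error(), factored out of raid10d() above (raid1 gets the same treatment earlier in this diff), retries the failed read against the other copies, writes the good data back over the copies that failed, and then re-reads them to confirm the rewrite took. A toy user-space walk-through of those three phases, with in-memory buffers in place of sync_page_io() and a string compare standing in for "the device completed the read without error":

#include <stdio.h>
#include <string.h>

#define COPIES 3

/* member devices holding the same block; copy 1 is the only good one */
static char copy[COPIES][8] = { "BADBAD", "gooddat", "BADBAD" };

int main(void)
{
        char page[8];
        int sl, good = -1;

        /* phase 1: read each copy in turn until one succeeds */
        for (sl = 0; sl < COPIES; sl++) {
                memcpy(page, copy[sl], sizeof(page));
                if (strcmp(page, "gooddat") == 0) {
                        good = sl;
                        break;
                }
        }
        if (good < 0)
                return 1;               /* cannot read from anywhere */

        /* phase 2: write the good data back over the other copies */
        for (sl = 0; sl < COPIES; sl++)
                if (sl != good)
                        memcpy(copy[sl], page, sizeof(page));

        /* phase 3: re-read every copy to verify the correction */
        for (sl = 0; sl < COPIES; sl++)
                printf("copy %d: %s\n", sl, copy[sl]);
        return 0;
}
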
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 450066007160c..e14f457807200 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -636,7 +636,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
- unsigned long flags;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
if (bi->bi_size)
@@ -654,7 +653,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
return 0;
}
- spin_lock_irqsave(&conf->device_lock, flags);
if (!uptodate)
md_error(conf->mddev, conf->disks[i].rdev);
@@ -662,8 +660,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- __release_stripe(conf, sh);
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ release_stripe(sh);
return 0;
}
@@ -696,12 +693,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
PRINTK("raid5: error called\n");
if (!test_bit(Faulty, &rdev->flags)) {
- mddev->sb_dirty = 1;
- if (test_bit(In_sync, &rdev->flags)) {
- conf->working_disks--;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->failed_disks++;
- clear_bit(In_sync, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery was running, make sure it aborts.
*/
@@ -711,7 +708,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
printk (KERN_ALERT
"raid5: Disk failure on %s, disabling device."
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
}
@@ -1108,7 +1105,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (sh->dev[i].written) BUG();
+ BUG_ON(sh->dev[i].written);
sh->dev[i].written = chosen;
}
break;
@@ -1353,10 +1350,9 @@ static int page_is_zero(struct page *p)
static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
{
int sectors_per_chunk = conf->chunk_size >> 9;
- sector_t x = stripe;
int pd_idx, dd_idx;
- int chunk_offset = sector_div(x, sectors_per_chunk);
- stripe = x;
+ int chunk_offset = sector_div(stripe, sectors_per_chunk);
+
raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
+ chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
return pd_idx;
@@ -2597,6 +2593,24 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid5_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ /* No difference between reads and writes. Just check
+ * how busy the stripe_cache is
+ */
+ if (conf->inactive_blocked)
+ return 1;
+ if (conf->quiesce)
+ return 1;
+ if (list_empty_careful(&conf->inactive_list))
+ return 1;
+
+ return 0;
+}
+
static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
@@ -2781,9 +2795,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->expand_progress;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
- wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+ wait_event(mddev->sb_wait, mddev->flags == 0 ||
kthread_should_stop());
spin_lock_irq(&conf->device_lock);
conf->expand_lo = mddev->reshape_position;
@@ -3074,6 +3088,7 @@ static int run(mddev_t *mddev)
mdk_rdev_t *rdev;
struct disk_info *disk;
struct list_head *tmp;
+ int working_disks = 0;
if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
@@ -3176,14 +3191,14 @@ static int run(mddev_t *mddev)
printk(KERN_INFO "raid5: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b),
raid_disk);
- conf->working_disks++;
+ working_disks++;
}
}
/*
* 0 for a fully functional array, 1 or 2 for a degraded array.
*/
- mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
+ mddev->degraded = conf->raid_disks - working_disks;
conf->mddev = mddev;
conf->chunk_size = mddev->chunk_size;
conf->level = mddev->level;
@@ -3218,7 +3233,7 @@ static int run(mddev_t *mddev)
if (mddev->degraded > conf->max_degraded) {
printk(KERN_ERR "raid5: not enough operational devices for %s"
" (%d/%d failed)\n",
- mdname(mddev), conf->failed_disks, conf->raid_disks);
+ mdname(mddev), mddev->degraded, conf->raid_disks);
goto abort;
}
@@ -3299,6 +3314,9 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
+
mddev->array_size = mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
@@ -3375,7 +3393,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
int i;
seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
- seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->disks[i].rdev &&
@@ -3397,8 +3415,8 @@ static void print_raid5_conf (raid5_conf_t *conf)
printk("(conf==NULL)\n");
return;
}
- printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
- conf->working_disks, conf->failed_disks);
+ printk(" --- rd:%d wd:%d\n", conf->raid_disks,
+ conf->raid_disks - conf->mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
@@ -3420,11 +3438,11 @@ static int raid5_spare_active(mddev_t *mddev)
tmp = conf->disks + i;
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- conf->failed_disks--;
- conf->working_disks++;
- set_bit(In_sync, &tmp->rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
print_raid5_conf(conf);
@@ -3560,6 +3578,7 @@ static int raid5_start_reshape(mddev_t *mddev)
struct list_head *rtmp;
int spares = 0;
int added_devices = 0;
+ unsigned long flags;
if (mddev->degraded ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3593,7 +3612,6 @@ static int raid5_start_reshape(mddev_t *mddev)
if (raid5_add_disk(mddev, rdev)) {
char nm[20];
set_bit(In_sync, &rdev->flags);
- conf->working_disks++;
added_devices++;
rdev->recovery_offset = 0;
sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3602,10 +3620,12 @@ static int raid5_start_reshape(mddev_t *mddev)
break;
}
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
mddev->raid_disks = conf->raid_disks;
mddev->reshape_position = 0;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);