From 395c72a707d966b36d5a42fe12c3a237ded3a0d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:34:55 -0700 Subject: block: Generalized bio pool freeing With the old code, when you allocate a bio from a bio pool you have to implement your own destructor that knows how to find the bio pool the bio was originally allocated from. This adds a new field to struct bio (bi_pool) and changes bio_alloc_bioset() to use it. This makes various bio destructors unnecessary, so they're then deleted. v6: Explain the temporary if statement in bio_put Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown CC: Alasdair Kergon CC: Nicholas Bellinger CC: Lars Ellenberg Acked-by: Tejun Heo Acked-by: Nicholas Bellinger Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7b7ac9ccec7a..af9dd9d2efc4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -80,6 +80,9 @@ struct bio { struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif + /* If bi_pool is non NULL, bi_destructor is not called */ + struct bio_set *bi_pool; + bio_destructor_t *bi_destructor; /* destructor */ /* -- cgit v1.2.3 From 1e2a410ff71504a64d1af2e354287ac51aeac1b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:34:56 -0700 Subject: block: Ues bi_pool for bio_integrity_alloc() Now that bios keep track of where they were allocated from, bio_integrity_alloc_bioset() becomes redundant. Remove bio_integrity_alloc_bioset() and drop bio_set argument from the related functions and make them use bio->bi_pool. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 2 +- fs/bio-integrity.c | 44 +++++++++++++++----------------------------- fs/bio.c | 6 +++--- include/linux/bio.h | 9 ++++----- 6 files changed, 26 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 4b4dbdfbca89..95c493511be7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2788,7 +2788,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, __bio_clone(bio, bio_src); if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask, bs)) + bio_integrity_clone(bio, bio_src, gfp_mask)) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0c3d6dd51897..f43aaf689bd0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1068,7 +1068,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_flags |= 1 << BIO_CLONED; if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO, bs); + bio_integrity_clone(clone, bio, GFP_NOIO); bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); } @@ -1094,7 +1094,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone->bi_flags &= ~(1 << BIO_SEG_VALID); if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO, bs); + bio_integrity_clone(clone, bio, GFP_NOIO); if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) bio_integrity_trim(clone, diff --git a/drivers/md/md.c b/drivers/md/md.c index b8eebe357b2b..457ca8451ddb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -186,7 +186,7 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask); if (ret < 0) { bio_put(b); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index e85c04b9f61c..a3f28f331b2b 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx) } /** - * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements - * @bs: bio_set to allocate from * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs, - struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { struct bio_integrity_payload *bip; unsigned int idx = vecs_to_idx(nr_vecs); + struct bio_set *bs = bio->bi_pool; + + if (!bs) + bs = fs_bio_set; BUG_ON(bio == NULL); bip = NULL; @@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, return bip; } -EXPORT_SYMBOL(bio_integrity_alloc_bioset); - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); -} EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed - * @bs: bio_set this bio was allocated from * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ -void bio_integrity_free(struct bio *bio, struct bio_set *bs) +void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_set *bs = bio->bi_pool; + + if (!bs) + bs = fs_bio_set; BUG_ON(bip == NULL); @@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split); * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask - * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask, struct bio_set *bs) + gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index e017f7a5cdc6..b14f71adff4a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -241,7 +241,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) - bio_integrity_free(bio, bs); + bio_integrity_free(bio); /* * If we have front padding, adjust the bio pointer before freeing @@ -341,7 +341,7 @@ EXPORT_SYMBOL(bio_alloc); static void bio_kmalloc_destructor(struct bio *bio) { if (bio_integrity(bio)) - bio_integrity_free(bio, fs_bio_set); + bio_integrity_free(bio); kfree(bio); } @@ -480,7 +480,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask); if (ret < 0) { bio_put(b); diff --git a/include/linux/bio.h b/include/linux/bio.h index 26435890dc87..a11f74bc82d2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -505,9 +505,8 @@ static inline struct bio *bio_list_get(struct bio_list *bl) #define bio_integrity(bio) (bio->bi_integrity != NULL) -extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); -extern void bio_integrity_free(struct bio *, struct bio_set *); +extern void bio_integrity_free(struct bio *); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern int bio_integrity_enabled(struct bio *bio); extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); @@ -517,7 +516,7 @@ extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern void bio_integrity_split(struct bio *, struct bio_pair *, int); -extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *); +extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); extern int bioset_integrity_create(struct bio_set *, int); extern void bioset_integrity_free(struct bio_set *); extern void bio_integrity_init(void); @@ -549,13 +548,13 @@ static inline int bio_integrity_prep(struct bio *bio) return 0; } -static inline void bio_integrity_free(struct bio *bio, struct bio_set *bs) +static inline void bio_integrity_free(struct bio *bio) { return; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask, struct bio_set *bs) + gfp_t gfp_mask) { return 0; } -- cgit v1.2.3 From f44b48c7691be7643877d1f881b5eeace654d05d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:34:58 -0700 Subject: block: Add bio_reset() Reusing bios is something that's been highly frowned upon in the past, but driver code keeps doing it anyways. If it's going to happen anyways, we should provide a generic method. This'll help with getting rid of bi_destructor - drivers/block/pktcdvd.c was open coding it, by doing a bio_init() and resetting bi_destructor. This required reordering struct bio, but the block layer is not yet nearly fast enough for any cacheline effects to matter here. v5: Add a define BIO_RESET_BITS, to be very explicit about what parts of bio->bi_flags are saved. v6: Further commenting verbosity, per Tejun v9: Add a function comment Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 24 ++++++++++++++++++++++++ include/linux/bio.h | 1 + include/linux/blk_types.h | 25 +++++++++++++++++++------ 3 files changed, 44 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/bio.c b/fs/bio.c index b14f71adff4a..919ee9aa5c57 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -262,6 +262,30 @@ void bio_init(struct bio *bio) } EXPORT_SYMBOL(bio_init); +/** + * bio_reset - reinitialize a bio + * @bio: bio to reset + * + * Description: + * After calling bio_reset(), @bio will be in the same state as a freshly + * allocated bio returned bio bio_alloc_bioset() - the only fields that are + * preserved are the ones that are initialized by bio_alloc_bioset(). See + * comment in struct bio. + */ +void bio_reset(struct bio *bio) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + if (bio_integrity(bio)) + bio_integrity_free(bio); + + bio_disassociate_task(bio); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags|(1 << BIO_UPTODATE); +} +EXPORT_SYMBOL(bio_reset); + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator diff --git a/include/linux/bio.h b/include/linux/bio.h index a11f74bc82d2..76f6c252baff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -226,6 +226,7 @@ extern void __bio_clone(struct bio *, struct bio *); extern struct bio *bio_clone(struct bio *, gfp_t); extern void bio_init(struct bio *); +extern void bio_reset(struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index af9dd9d2efc4..1b607c247d72 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -59,12 +59,6 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ - - atomic_t bi_cnt; /* pin count */ - - struct bio_vec *bi_io_vec; /* the actual vec list */ - bio_end_io_t *bi_end_io; void *bi_private; @@ -80,6 +74,16 @@ struct bio { struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif + /* + * Everything starting with bi_max_vecs will be preserved by bio_reset() + */ + + unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ + + atomic_t bi_cnt; /* pin count */ + + struct bio_vec *bi_io_vec; /* the actual vec list */ + /* If bi_pool is non NULL, bi_destructor is not called */ struct bio_set *bi_pool; @@ -93,6 +97,8 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; +#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) + /* * bio flags */ @@ -108,6 +114,13 @@ struct bio { #define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ #define BIO_QUIET 10 /* Make BIO Quiet */ #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */ + +/* + * Flags starting here get preserved by bio_reset() - this includes + * BIO_POOL_IDX() + */ +#define BIO_RESET_BITS 12 + #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* -- cgit v1.2.3 From 4254bba17d92d53822a56ebc2a0c1eb7e2a71155 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:35:00 -0700 Subject: block: Kill bi_destructor Now that we've got generic code for freeing bios allocated from bio pools, this isn't needed anymore. This patch also makes bio_free() static, since without bi_destructor there should be no need for it to be called anywhere else. bio_free() is now only called from bio_put, so we can refactor those a bit - move some code from bio_put() to bio_free() and kill the redundant bio->bi_next = NULL. v5: Switch to BIO_KMALLOC_POOL ((void *)~0), per Boaz v6: BIO_KMALLOC_POOL now NULL, drop bio_free's EXPORT_SYMBOL v7: No #define BIO_KMALLOC_POOL anymore Signed-off-by: Kent Overstreet CC: Jens Axboe Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 5 ---- block/blk-core.c | 2 +- fs/bio.c | 64 +++++++++++++++++------------------------- include/linux/bio.h | 1 - include/linux/blk_types.h | 3 -- 5 files changed, 27 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index e418dc0a7086..8df5e8e6dceb 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -465,7 +465,6 @@ struct bio { bio_end_io_t *bi_end_io; /* bi_end_io (bio) */ atomic_t bi_cnt; /* pin count: free when it hits zero */ void *bi_private; - bio_destructor_t *bi_destructor; /* bi_destructor (bio) */ }; With this multipage bio design: @@ -647,10 +646,6 @@ for a non-clone bio. There are the 6 pools setup for different size biovecs, so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the given size from these slabs. -The bi_destructor() routine takes into account the possibility of the bio -having originated from a different source (see later discussions on -n/w to block transfers and kvec_cb) - The bio_get() routine may be used to hold an extra reference on a bio prior to i/o submission, if the bio fields are likely to be accessed after the i/o is issued (since the bio may otherwise get freed in case i/o completion diff --git a/block/blk-core.c b/block/blk-core.c index 95c493511be7..b776cc90a4e7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2807,7 +2807,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, free_and_out: if (bio) - bio_free(bio, bs); + bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; diff --git a/fs/bio.c b/fs/bio.c index 919ee9aa5c57..736ef12f5191 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -233,26 +233,37 @@ fallback: return bvl; } -void bio_free(struct bio *bio, struct bio_set *bs) +static void __bio_free(struct bio *bio) { - void *p; - - if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + bio_disassociate_task(bio); if (bio_integrity(bio)) bio_integrity_free(bio); +} - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - if (bs->front_pad) +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; + void *p; + + __bio_free(bio); + + if (bs) { + if (bio_has_allocated_vec(bio)) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; p -= bs->front_pad; - mempool_free(p, bs->bio_pool); + mempool_free(p, bs->bio_pool); + } else { + /* Bio was allocated by bio_kmalloc() */ + kfree(bio); + } } -EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -276,10 +287,7 @@ void bio_reset(struct bio *bio) { unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - if (bio_integrity(bio)) - bio_integrity_free(bio); - - bio_disassociate_task(bio); + __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags|(1 << BIO_UPTODATE); @@ -362,13 +370,6 @@ struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } EXPORT_SYMBOL(bio_alloc); -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_integrity(bio)) - bio_integrity_free(bio); - kfree(bio); -} - /** * bio_kmalloc - allocate a bio for I/O using kmalloc() * @gfp_mask: the GFP_ mask given to the slab allocator @@ -395,7 +396,6 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_destructor = bio_kmalloc_destructor; return bio; } @@ -431,20 +431,8 @@ void bio_put(struct bio *bio) /* * last put frees it */ - if (atomic_dec_and_test(&bio->bi_cnt)) { - bio_disassociate_task(bio); - bio->bi_next = NULL; - - /* - * This if statement is temporary - bi_pool is replacing - * bi_destructor, but bi_destructor will be taken out in another - * patch. - */ - if (bio->bi_pool) - bio_free(bio, bio->bi_pool); - else - bio->bi_destructor(bio); - } + if (atomic_dec_and_test(&bio->bi_cnt)) + bio_free(bio); } EXPORT_SYMBOL(bio_put); diff --git a/include/linux/bio.h b/include/linux/bio.h index 76f6c252baff..04944c91fae7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -216,7 +216,6 @@ extern struct bio *bio_alloc(gfp_t, unsigned int); extern struct bio *bio_kmalloc(gfp_t, unsigned int); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); -extern void bio_free(struct bio *, struct bio_set *); extern void bio_endio(struct bio *, int); struct request_queue; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b607c247d72..3eefbb291192 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -84,11 +84,8 @@ struct bio { struct bio_vec *bi_io_vec; /* the actual vec list */ - /* If bi_pool is non NULL, bi_destructor is not called */ struct bio_set *bi_pool; - bio_destructor_t *bi_destructor; /* destructor */ - /* * We can inline a number of vecs at the end of the bio, to avoid * double allocations for a small number of bio_vecs. This member -- cgit v1.2.3 From 3f86a82aeb03e6100f7ab39f4702e033a5e38166 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:35:01 -0700 Subject: block: Consolidate bio_alloc_bioset(), bio_kmalloc() Previously, bio_kmalloc() and bio_alloc_bioset() behaved slightly different because there was some almost-duplicated code - this fixes some of that. The important change is that previously bio_kmalloc() always set bi_io_vec = bi_inline_vecs, even if nr_iovecs == 0 - unlike bio_alloc_bioset(). This would cause bio_has_data() to return true; I don't know if this resulted in any actual bugs but it was certainly wrong. bio_kmalloc() and bio_alloc_bioset() also have different arbitrary limits on nr_iovecs - 1024 (UIO_MAXIOV) for bio_kmalloc(), 256 (BIO_MAX_PAGES) for bio_alloc_bioset(). This patch doesn't fix that, but at least they're enforced closer together and hopefully they will be fixed in a later patch. This'll also help with some future cleanups - there are a fair number of functions that allocate bios (e.g. bio_clone()), and now they don't have to be duplicated for bio_alloc(), bio_alloc_bioset(), and bio_kmalloc(). Signed-off-by: Kent Overstreet CC: Jens Axboe v7: Re-add dropped comments, improv patch description Signed-off-by: Jens Axboe --- fs/bio.c | 110 ++++++++++++++++++---------------------------------- include/linux/bio.h | 16 ++++++-- 2 files changed, 49 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/fs/bio.c b/fs/bio.c index 736ef12f5191..191b9b86c272 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { * IO code that does not need private memory pools. */ struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management @@ -301,39 +302,58 @@ EXPORT_SYMBOL(bio_reset); * @bs: the bio_set to allocate from. * * Description: - * bio_alloc_bioset will try its own mempool to satisfy the allocation. - * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. - **/ + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. + * + * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be + * able to allocate a bio. This is due to the mempool guarantees. To make this + * work, callers must never allocate more than 1 bio at a time from this pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned front_pad; + unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; struct bio *bio; void *p; - p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + p = kmalloc(sizeof(struct bio) + + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + p = mempool_alloc(bs->bio_pool, gfp_mask); + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + if (unlikely(!p)) return NULL; - bio = p + bs->front_pad; + bio = p + front_pad; bio_init(bio); - bio->bi_pool = bs; - - if (unlikely(!nr_iovecs)) - goto out_set; - if (nr_iovecs <= BIO_INLINE_VECS) { - bvl = bio->bi_inline_vecs; - nr_iovecs = BIO_INLINE_VECS; - } else { + if (nr_iovecs > inline_vecs) { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) goto err_free; - - nr_iovecs = bvec_nr_vecs(idx); + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; } -out_set: + + bio->bi_pool = bs; bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; @@ -345,62 +365,6 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); -/** - * bio_alloc - allocate a new bio, memory pool backed - * @gfp_mask: allocation mask to use - * @nr_iovecs: number of iovecs - * - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. Failure to - * do so can cause livelocks under memory pressure. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); -} -EXPORT_SYMBOL(bio_alloc); - -/** - * bio_kmalloc - allocate a bio for I/O using kmalloc() - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * - * Description: - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains - * %__GFP_WAIT, the allocation is guaranteed to succeed. - * - **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - struct bio *bio; - - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - if (unlikely(!bio)) - return NULL; - - bio_init(bio); - bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bio->bi_inline_vecs; - - return bio; -} -EXPORT_SYMBOL(bio_kmalloc); - void zero_fill_bio(struct bio *bio) { unsigned long flags; diff --git a/include/linux/bio.h b/include/linux/bio.h index 04944c91fae7..fbe35b175555 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -212,11 +212,21 @@ extern void bio_pair_release(struct bio_pair *dbio); extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); -extern struct bio *bio_alloc(gfp_t, unsigned int); -extern struct bio *bio_kmalloc(gfp_t, unsigned int); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); +extern struct bio_set *fs_bio_set; + +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); +} + +static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); +} + extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -304,8 +314,6 @@ struct biovec_slab { struct kmem_cache *slab; }; -extern struct bio_set *fs_bio_set; - /* * a small number of entries is fine, not going to be performance critical. * basically we just need to survive -- cgit v1.2.3 From bf800ef1816b4283a885e55ad38068aec9711e4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:35:02 -0700 Subject: block: Add bio_clone_bioset(), bio_clone_kmalloc() Previously, there was bio_clone() but it only allocated from the fs bio set; as a result various users were open coding it and using __bio_clone(). This changes bio_clone() to become bio_clone_bioset(), and then we add bio_clone() and bio_clone_kmalloc() as wrappers around it, making use of the functionality the last patch adedd. This will also help in a later patch changing how bio cloning works. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown CC: Alasdair Kergon CC: Boaz Harrosh CC: Jeff Garzik Acked-by: Jeff Garzik Signed-off-by: Jens Axboe --- block/blk-core.c | 8 +------- drivers/block/osdblk.c | 3 +-- drivers/md/dm-crypt.c | 7 +------ drivers/md/dm.c | 4 ++-- drivers/md/md.c | 20 +------------------- fs/bio.c | 11 +++++++---- fs/exofs/ore.c | 5 ++--- include/linux/bio.h | 17 ++++++++++++++--- 8 files changed, 29 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index b776cc90a4e7..82aab2815858 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2781,16 +2781,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, blk_rq_init(NULL, rq); __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); + bio = bio_clone_bioset(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; - __bio_clone(bio, bio_src); - - if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask)) - goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 87311ebac0db..1bbc681688e4 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -266,11 +266,10 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) struct bio *tmp, *new_chain = NULL, *tail = NULL; while (old_chain) { - tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); + tmp = bio_clone_kmalloc(old_chain, gfpmask); if (!tmp) goto err_out; - __bio_clone(tmp, old_chain); tmp->bi_bdev = NULL; gfpmask &= ~__GFP_WAIT; tmp->bi_next = NULL; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3c0acba042b6..bbf459bca61d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -979,19 +979,14 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. */ - clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + clone = bio_clone_bioset(base_bio, gfp, cc->bs); if (!clone) return 1; crypt_inc_pending(io); clone_init(io, clone); - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(base_bio); - clone->bi_size = base_bio->bi_size; clone->bi_sector = cc->start + io->sector; - memcpy(clone->bi_io_vec, bio_iovec(base_bio), - sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); return 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 33470f01ea5e..837879716889 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1129,8 +1129,8 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. */ - clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); - __bio_clone(clone, ci->bio); + clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs); + if (len) { clone->bi_sector = ci->sector; clone->bi_size = to_bytes(len); diff --git a/drivers/md/md.c b/drivers/md/md.c index 457ca8451ddb..7a2b0793f66e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -173,28 +173,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev) { - struct bio *b; - if (!mddev || !mddev->bio_set) return bio_clone(bio, gfp_mask); - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, mddev->bio_set); - if (!b) - return NULL; - - __bio_clone(b, bio); - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; + return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); } EXPORT_SYMBOL_GPL(bio_clone_mddev); diff --git a/fs/bio.c b/fs/bio.c index 191b9b86c272..13e956779e10 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -438,16 +438,19 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) EXPORT_SYMBOL(__bio_clone); /** - * bio_clone - clone a bio + * bio_clone_bioset - clone a bio * @bio: bio to clone * @gfp_mask: allocation priority + * @bs: bio_set to allocate from * * Like __bio_clone, only also allocates the returned bio */ -struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, + struct bio_set *bs) { - struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs); + struct bio *b; + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); if (!b) return NULL; @@ -466,7 +469,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } -EXPORT_SYMBOL(bio_clone); +EXPORT_SYMBOL(bio_clone_bioset); /** * bio_get_nr_vecs - return approx number of vecs diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 1585db1aa365..f936cb50dc0d 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct bio *bio; if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_KERNEL, - master_dev->bio->bi_max_vecs); + bio = bio_clone_kmalloc(master_dev->bio, + GFP_KERNEL); if (unlikely(!bio)) { ORE_DBGMSG( "Failed to allocate BIO size=%u\n", @@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; diff --git a/include/linux/bio.h b/include/linux/bio.h index fbe35b175555..52b9cbc3e4da 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -215,6 +215,9 @@ extern void bioset_free(struct bio_set *); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); +extern void __bio_clone(struct bio *, struct bio *); +extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); + extern struct bio_set *fs_bio_set; static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) @@ -222,18 +225,26 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); } +static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, fs_bio_set); +} + static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); } +static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, NULL); + +} + extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); -extern void __bio_clone(struct bio *, struct bio *); -extern struct bio *bio_clone(struct bio *, gfp_t); - extern void bio_init(struct bio *); extern void bio_reset(struct bio *); -- cgit v1.2.3 From e2a60da74fc8215c68509a89e9a69c66363153db Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 18 Sep 2012 12:19:25 -0400 Subject: block: Clean up special command handling logic Remove special-casing of non-rw fs style requests (discard). The nomerge flags are consolidated in blk_types.h, and rq_mergeable() and bio_mergeable() have been modified to use them. bio_is_rw() is used in place of bio_has_data() a few places. This is done to to distinguish true reads and writes from other fs type requests that carry a payload (e.g. write same). Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 13 ++++++------- block/blk-merge.c | 22 +--------------------- block/blk.h | 5 ++--- block/elevator.c | 6 ++---- include/linux/bio.h | 23 +++++++++++++++++++++-- include/linux/blk_types.h | 4 ++++ include/linux/blkdev.h | 22 ++++++++++------------ 7 files changed, 46 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 2d739ca10923..5cc29299f6ac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1657,8 +1657,8 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (unlikely(!(bio->bi_rw & REQ_DISCARD) && - nr_sectors > queue_max_hw_sectors(q))) { + if (likely(bio_is_rw(bio) && + nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), @@ -1699,8 +1699,7 @@ generic_make_request_checks(struct bio *bio) if ((bio->bi_rw & REQ_DISCARD) && (!blk_queue_discard(q) || - ((bio->bi_rw & REQ_SECURE) && - !blk_queue_secdiscard(q)))) { + ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { err = -EOPNOTSUPP; goto end_io; } @@ -1818,7 +1817,7 @@ void submit_bio(int rw, struct bio *bio) * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ - if (bio_has_data(bio) && !(rw & REQ_DISCARD)) { + if (bio_has_data(bio)) { if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { @@ -1864,7 +1863,7 @@ EXPORT_SYMBOL(submit_bio); */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { - if (rq->cmd_flags & REQ_DISCARD) + if (!rq_mergeable(rq)) return 0; if (blk_rq_sectors(rq) > queue_max_sectors(q) || @@ -2338,7 +2337,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ - if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD)) + if (req->cmd_type == REQ_TYPE_FS) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ diff --git a/block/blk-merge.c b/block/blk-merge.c index e76279e41162..86710ca408b8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -417,18 +417,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; - /* - * Don't merge file system requests and discard requests - */ - if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) - return 0; - - /* - * Don't merge discard requests and secure discard requests - */ - if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) - return 0; - /* * not contiguous */ @@ -521,15 +509,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { - if (!rq_mergeable(rq)) - return false; - - /* don't merge file system requests and discard requests */ - if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) - return false; - - /* don't merge discard requests and secure discard requests */ - if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) + if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; /* different data direction or already started, don't merge */ diff --git a/block/blk.h b/block/blk.h index 2a0ea32d249f..ca51543b248c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -171,14 +171,13 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) * * a) it's attached to a gendisk, and * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request or a discard request + * c) it's a file system request */ static inline int blk_do_io_stat(struct request *rq) { return rq->rq_disk && (rq->cmd_flags & REQ_IO_STAT) && - (rq->cmd_type == REQ_TYPE_FS || - (rq->cmd_flags & REQ_DISCARD)); + (rq->cmd_type == REQ_TYPE_FS); } /* diff --git a/block/elevator.c b/block/elevator.c index 6a55d418896f..9b1d42b62f20 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -562,8 +562,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS || - (rq->cmd_flags & REQ_DISCARD)) { + if (rq->cmd_type == REQ_TYPE_FS) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } @@ -605,8 +604,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) if (elv_attempt_insert_merge(q, rq)) break; case ELEVATOR_INSERT_SORT: - BUG_ON(rq->cmd_type != REQ_TYPE_FS && - !(rq->cmd_flags & REQ_DISCARD)); + BUG_ON(rq->cmd_type != REQ_TYPE_FS); rq->cmd_flags |= REQ_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 52b9cbc3e4da..e54305cacc98 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -386,9 +386,28 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, /* * Check whether this bio carries any data or not. A NULL bio is allowed. */ -static inline int bio_has_data(struct bio *bio) +static inline bool bio_has_data(struct bio *bio) { - return bio && bio->bi_io_vec != NULL; + if (bio && bio->bi_vcnt) + return true; + + return false; +} + +static inline bool bio_is_rw(struct bio *bio) +{ + if (!bio_has_data(bio)) + return false; + + return true; +} + +static inline bool bio_mergeable(struct bio *bio) +{ + if (bio->bi_rw & REQ_NOMERGE_FLAGS) + return false; + + return true; } /* diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3eefbb291192..1b229664f573 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -194,6 +194,10 @@ enum rq_flag_bits { REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK +/* This mask is used for both bio and request merge checking */ +#define REQ_NOMERGE_FLAGS \ + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) + #define REQ_RAHEAD (1 << __REQ_RAHEAD) #define REQ_THROTTLED (1 << __REQ_THROTTLED) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a2ab7c85393..3a6fea7460f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -540,8 +540,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_account_rq(rq) \ (((rq)->cmd_flags & REQ_STARTED) && \ - ((rq)->cmd_type == REQ_TYPE_FS || \ - ((rq)->cmd_flags & REQ_DISCARD))) + ((rq)->cmd_type == REQ_TYPE_FS)) #define blk_pm_request(rq) \ ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ @@ -595,17 +594,16 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) rl->flags &= ~flag; } +static inline bool rq_mergeable(struct request *rq) +{ + if (rq->cmd_type != REQ_TYPE_FS) + return false; -/* - * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may - * it already be started by driver. - */ -#define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD) -#define rq_mergeable(rq) \ - (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ - (((rq)->cmd_flags & REQ_DISCARD) || \ - (rq)->cmd_type == REQ_TYPE_FS)) + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) + return false; + + return true; +} /* * q->prep_rq_fn return values -- cgit v1.2.3 From f31dc1cd490539e2b62a126bc4dc2495b165d772 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 18 Sep 2012 12:19:26 -0400 Subject: block: Consolidate command flag and queue limit checks for merges - blk_check_merge_flags() verifies that cmd_flags / bi_rw are compatible. This function is called for both req-req and req-bio merging. - blk_rq_get_max_sectors() and blk_queue_get_max_sectors() can be used to query the maximum sector count for a given request or queue. The calls will return the right value from the queue limits given the type of command (RW, discard, write same, etc.) Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- block/blk-merge.c | 30 ++++++++++++------------------ include/linux/blkdev.h | 31 +++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 5cc29299f6ac..33eded00c5b1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1866,8 +1866,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) if (!rq_mergeable(rq)) return 0; - if (blk_rq_sectors(rq) > queue_max_sectors(q) || - blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { + if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 86710ca408b8..642b862608a1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -275,14 +275,8 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - unsigned short max_sectors; - - if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) - max_sectors = queue_max_hw_sectors(q); - else - max_sectors = queue_max_sectors(q); - - if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -299,15 +293,8 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - unsigned short max_sectors; - - if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) - max_sectors = queue_max_hw_sectors(q); - else - max_sectors = queue_max_sectors(q); - - - if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -338,7 +325,8 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q)) + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > + blk_rq_get_max_sectors(req)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; @@ -417,6 +405,9 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; + if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) + return 0; + /* * not contiguous */ @@ -512,6 +503,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; + if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) + return false; + /* different data direction or already started, don't merge */ if (bio_data_dir(bio) != rq_data_dir(rq)) return false; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3a6fea7460f1..90f7abe8f183 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -605,6 +605,18 @@ static inline bool rq_mergeable(struct request *rq) return true; } +static inline bool blk_check_merge_flags(unsigned int flags1, + unsigned int flags2) +{ + if ((flags1 & REQ_DISCARD) != (flags2 & REQ_DISCARD)) + return false; + + if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE)) + return false; + + return true; +} + /* * q->prep_rq_fn return values */ @@ -800,6 +812,25 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, + unsigned int cmd_flags) +{ + if (unlikely(cmd_flags & REQ_DISCARD)) + return q->limits.max_discard_sectors; + + return q->limits.max_sectors; +} + +static inline unsigned int blk_rq_get_max_sectors(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) + return q->limits.max_hw_sectors; + + return blk_queue_get_max_sectors(q, rq->cmd_flags); +} + /* * Request issue related functions. */ -- cgit v1.2.3 From 4363ac7c13a9a4b763c6e8d9fdbfc2468f3b8ca4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 18 Sep 2012 12:19:27 -0400 Subject: block: Implement support for WRITE SAME The WRITE SAME command supported on some SCSI devices allows the same block to be efficiently replicated throughout a block range. Only a single logical block is transferred from the host and the storage device writes the same data to all blocks described by the I/O. This patch implements support for WRITE SAME in the block layer. The blkdev_issue_write_same() function can be used by filesystems and block drivers to replicate a buffer across a block range. This can be used to efficiently initialize software RAID devices, etc. Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 14 +++++++ block/blk-core.c | 14 ++++++- block/blk-lib.c | 74 +++++++++++++++++++++++++++++++++++ block/blk-merge.c | 9 +++++ block/blk-settings.c | 16 ++++++++ block/blk-sysfs.c | 13 ++++++ drivers/md/raid0.c | 1 + fs/bio.c | 9 +++-- include/linux/bio.h | 3 ++ include/linux/blk_types.h | 5 ++- include/linux/blkdev.h | 29 ++++++++++++++ 11 files changed, 181 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index c1eb41cb9876..279da08f7541 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -206,3 +206,17 @@ Description: when a discarded area is read the discard_zeroes_data parameter will be set to one. Otherwise it will be 0 and the result of reading a discarded area is undefined. + +What: /sys/block//queue/write_same_max_bytes +Date: January 2012 +Contact: Martin K. Petersen +Description: + Some devices support a write same operation in which a + single data block can be written to a range of several + contiguous blocks on storage. This can be used to wipe + areas on disk or to initialize drives in a RAID + configuration. write_same_max_bytes indicates how many + bytes can be written in a single write same command. If + write_same_max_bytes is 0, write same is not supported + by the device. + diff --git a/block/blk-core.c b/block/blk-core.c index 33eded00c5b1..3b080541098e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1704,6 +1704,11 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { + err = -EOPNOTSUPP; + goto end_io; + } + /* * Various block parts want %current->io_context and lazy ioc * allocation ends up trading a lot of pain for a small amount of @@ -1809,8 +1814,6 @@ EXPORT_SYMBOL(generic_make_request); */ void submit_bio(int rw, struct bio *bio) { - int count = bio_sectors(bio); - bio->bi_rw |= rw; /* @@ -1818,6 +1821,13 @@ void submit_bio(int rw, struct bio *bio) * go through the normal accounting stuff before submission. */ if (bio_has_data(bio)) { + unsigned int count; + + if (unlikely(rw & REQ_WRITE_SAME)) + count = bdev_logical_block_size(bio->bi_bdev) >> 9; + else + count = bio_sectors(bio); + if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { diff --git a/block/blk-lib.c b/block/blk-lib.c index 19cc761cacb2..a062543c58ac 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -129,6 +129,80 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data to write + * + * Description: + * Issue a write same request for the sectors in question. + */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_write_same_sectors; + struct bio_batch bb; + struct bio *bio; + int ret = 0; + + if (!q) + return -ENXIO; + + max_write_same_sectors = q->limits.max_write_same_sectors; + + if (max_write_same_sectors == 0) + return -EOPNOTSUPP; + + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + while (nr_sects) { + bio = bio_alloc(gfp_mask, 1); + if (!bio) { + ret = -ENOMEM; + break; + } + + bio->bi_sector = sector; + bio->bi_end_io = bio_batch_end_io; + bio->bi_bdev = bdev; + bio->bi_private = &bb; + bio->bi_vcnt = 1; + bio->bi_io_vec->bv_page = page; + bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); + + if (nr_sects > max_write_same_sectors) { + bio->bi_size = max_write_same_sectors << 9; + nr_sects -= max_write_same_sectors; + sector += max_write_same_sectors; + } else { + bio->bi_size = nr_sects << 9; + nr_sects = 0; + } + + atomic_inc(&bb.done); + submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); + } + + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) + wait_for_completion(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -ENOTSUPP; + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_write_same); + /** * blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue diff --git a/block/blk-merge.c b/block/blk-merge.c index 642b862608a1..936a110de0b9 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -419,6 +419,10 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; + if (req->cmd_flags & REQ_WRITE_SAME && + !blk_write_same_mergeable(req->bio, next->bio)) + return 0; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -518,6 +522,11 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_integrity(bio) != blk_integrity_rq(rq)) return false; + /* must be using the same buffer */ + if (rq->cmd_flags & REQ_WRITE_SAME && + !blk_write_same_mergeable(rq->bio, bio)) + return false; + return true; } diff --git a/block/blk-settings.c b/block/blk-settings.c index 565a6786032f..779bb7646bcd 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; @@ -144,6 +145,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -285,6 +287,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_discard_sectors); +/** + * blk_queue_max_write_same_sectors - set max sectors for a single write same + * @q: the request queue for the device + * @max_write_same_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_same_sectors(struct request_queue *q, + unsigned int max_write_same_sectors) +{ + q->limits.max_write_same_sectors = max_write_same_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_same_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -510,6 +524,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_write_same_sectors = min(t->max_write_same_sectors, + b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ea51d827a0bb..247dbfd42621 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -180,6 +180,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag return queue_var_show(queue_discard_zeroes_data(q), page); } +static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_same_sectors << 9); +} + + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -385,6 +392,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { .show = queue_discard_zeroes_data_show, }; +static struct queue_sysfs_entry queue_write_same_max_entry = { + .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO }, + .show = queue_write_same_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, @@ -432,6 +444,7 @@ static struct attribute *default_attrs[] = { &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index de63a1fc3737..a9e4fa95dfaa 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -422,6 +422,7 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { diff --git a/fs/bio.c b/fs/bio.c index 13e956779e10..f855e0e1869c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1487,9 +1487,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bv1 = bi->bi_io_vec[0]; bp->bv2 = bi->bi_io_vec[0]; - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; + + if (bio_is_rw(bi)) { + bp->bv2.bv_offset += first_sectors << 9; + bp->bv2.bv_len -= first_sectors << 9; + bp->bv1.bv_len = first_sectors << 9; + } bp->bio1.bi_io_vec = &bp->bv1; bp->bio2.bi_io_vec = &bp->bv2; diff --git a/include/linux/bio.h b/include/linux/bio.h index e54305cacc98..820e7aaad4fd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -399,6 +399,9 @@ static inline bool bio_is_rw(struct bio *bio) if (!bio_has_data(bio)) return false; + if (bio->bi_rw & REQ_WRITE_SAME) + return false; + return true; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b229664f573..cdf11191e645 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -147,6 +147,7 @@ enum rq_flag_bits { __REQ_PRIO, /* boost priority in cfq */ __REQ_DISCARD, /* request to discard sectors */ __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ + __REQ_WRITE_SAME, /* write same block many times */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_FUA, /* forced unit access */ @@ -185,13 +186,15 @@ enum rq_flag_bits { #define REQ_META (1 << __REQ_META) #define REQ_PRIO (1 << __REQ_PRIO) #define REQ_DISCARD (1 << __REQ_DISCARD) +#define REQ_WRITE_SAME (1 << __REQ_WRITE_SAME) #define REQ_NOIDLE (1 << __REQ_NOIDLE) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ - REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE) + REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ + REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK /* This mask is used for both bio and request merge checking */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 90f7abe8f183..1756001210d2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -270,6 +270,7 @@ struct queue_limits { unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; + unsigned int max_write_same_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -614,9 +615,20 @@ static inline bool blk_check_merge_flags(unsigned int flags1, if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE)) return false; + if ((flags1 & REQ_WRITE_SAME) != (flags2 & REQ_WRITE_SAME)) + return false; + return true; } +static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) +{ + if (bio_data(a) == bio_data(b)) + return true; + + return false; +} + /* * q->prep_rq_fn return values */ @@ -818,6 +830,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(cmd_flags & REQ_DISCARD)) return q->limits.max_discard_sectors; + if (unlikely(cmd_flags & REQ_WRITE_SAME)) + return q->limits.max_write_same_sectors; + return q->limits.max_sectors; } @@ -886,6 +901,8 @@ extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); +extern void blk_queue_max_write_same_sectors(struct request_queue *q, + unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, @@ -1016,6 +1033,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); +extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); static inline int sb_issue_discard(struct super_block *sb, sector_t block, @@ -1193,6 +1212,16 @@ static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) return queue_discard_zeroes_data(bdev_get_queue(bdev)); } +static inline unsigned int bdev_write_same(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return q->limits.max_write_same_sectors; + + return 0; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 66ba32dc167202c3cf8c86806581a9393ec7f488 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 18 Sep 2012 12:19:29 -0400 Subject: block: ioctl to zero block ranges Introduce a BLKZEROOUT ioctl which can be used to clear block ranges by way of blkdev_issue_zeroout(). Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/ioctl.c | 27 +++++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/block/ioctl.c b/block/ioctl.c index 4476e0e85d16..769d2960c0a6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -185,6 +185,22 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } +static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, + uint64_t len) +{ + if (start & 511) + return -EINVAL; + if (len & 511) + return -EINVAL; + start >>= 9; + len >>= 9; + + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + return -EINVAL; + + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); +} + static int put_ushort(unsigned long arg, unsigned short val) { return put_user(val, (unsigned short __user *)arg); @@ -300,6 +316,17 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return blk_ioctl_discard(bdev, range[0], range[1], cmd == BLKSECDISCARD); } + case BLKZEROOUT: { + uint64_t range[2]; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + return blk_ioctl_zeroout(bdev, range[0], range[1]); + } case HDIO_GETGEO: { struct hd_geometry geo; diff --git a/include/linux/fs.h b/include/linux/fs.h index aa110476a95b..bd6f6e7ca48e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -335,6 +335,7 @@ struct inodes_stat_t { #define BLKDISCARDZEROES _IO(0x12,124) #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) +#define BLKZEROOUT _IO(0x12,127) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From b87570f5d349661814b262dd5fc40787700f80d6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Sep 2012 07:46:40 +0200 Subject: Fix a crash when block device is read and block size is changed at the same time The kernel may crash when block size is changed and I/O is issued simultaneously. Because some subsystems (udev or lvm) may read any block device anytime, the bug actually puts any code that changes a block device size in jeopardy. The crash can be reproduced if you place "msleep(1000)" to blkdev_get_blocks just before "bh->b_size = max_blocks << inode->i_blkbits;". Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct" While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0" You get a BUG. The direct and non-direct I/O is written with the assumption that block size does not change. It doesn't seem practical to fix these crashes one-by-one there may be many crash possibilities when block size changes at a certain place and it is impossible to find them all and verify the code. This patch introduces a new rw-lock bd_block_size_semaphore. The lock is taken for read during I/O. It is taken for write when changing block size. Consequently, block size can't be changed while I/O is being submitted. For asynchronous I/O, the patch only prevents block size change while the I/O is being submitted. The block size can change when the I/O is in progress or when the I/O is being finished. This is acceptable because there are no accesses to block size when asynchronous I/O is being finished. The patch prevents block size changing while the device is mapped with mmap. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- drivers/char/raw.c | 2 +- fs/block_dev.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 4 ++++ 3 files changed, 65 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 54a3a6d09819..0bb207eaef2f 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = do_sync_read, - .aio_read = generic_file_aio_read, + .aio_read = blkdev_aio_read, .write = do_sync_write, .aio_write = blkdev_aio_write, .fsync = blkdev_fsync, diff --git a/fs/block_dev.c b/fs/block_dev.c index 38e721b35d45..cdfb625824e2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { + struct address_space *mapping; + /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; + /* Prevent starting I/O or mapping the device */ + down_write(&bdev->bd_block_size_semaphore); + + /* Check that the block device is not memory mapped */ + mapping = bdev->bd_inode->i_mapping; + mutex_lock(&mapping->i_mmap_mutex); + if (!prio_tree_empty(&mapping->i_mmap) || + !list_empty(&mapping->i_mmap_nonlinear)) { + mutex_unlock(&mapping->i_mmap_mutex); + up_write(&bdev->bd_block_size_semaphore); + return -EBUSY; + } + mutex_unlock(&mapping->i_mmap_mutex); + /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } + + up_write(&bdev->bd_block_size_semaphore); + return 0; } @@ -472,6 +491,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); + init_rwsem(&bdev->bd_block_size_semaphore); } static inline void __bd_forget(struct inode *inode) @@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + + down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + + up_read(&bdev->bd_block_size_semaphore); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_read); + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); + + down_read(&bdev->bd_block_size_semaphore); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + + up_read(&bdev->bd_block_size_semaphore); + blk_finish_plug(&plug); + return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); +int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + + down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_mmap(file, vma); + + up_read(&bdev->bd_block_size_semaphore); + + return ret; +} + /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = generic_file_aio_read, + .aio_read = blkdev_aio_read, .aio_write = blkdev_aio_write, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/include/linux/fs.h b/include/linux/fs.h index bd6f6e7ca48e..e60bbd0225d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -725,6 +725,8 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; + /* A semaphore that prevents I/O while block size is being changed */ + struct rw_semaphore bd_block_size_semaphore; }; /* @@ -2565,6 +2567,8 @@ extern int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count, int access_flags); /* fs/block_dev.c */ +extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, -- cgit v1.2.3 From 62ac665ff9fc07497ca524bd20d6a96893d11071 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Sep 2012 07:46:43 +0200 Subject: blockdev: turn a rw semaphore into a percpu rw semaphore This avoids cache line bouncing when many processes lock the semaphore for read. New percpu lock implementation The lock consists of an array of percpu unsigned integers, a boolean variable and a mutex. When we take the lock for read, we enter rcu read section, check for a "locked" variable. If it is false, we increase a percpu counter on the current cpu and exit the rcu section. If "locked" is true, we exit the rcu section, take the mutex and drop it (this waits until a writer finished) and retry. Unlocking for read just decreases percpu variable. Note that we can unlock on a difference cpu than where we locked, in this case the counter underflows. The sum of all percpu counters represents the number of processes that hold the lock for read. When we need to lock for write, we take the mutex, set "locked" variable to true and synchronize rcu. Since RCU has been synchronized, no processes can create new read locks. We wait until the sum of percpu counters is zero - when it is, there are no readers in the critical section. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- Documentation/percpu-rw-semaphore.txt | 27 +++++++++++ fs/block_dev.c | 27 +++++++---- include/linux/fs.h | 3 +- include/linux/percpu-rwsem.h | 89 +++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 Documentation/percpu-rw-semaphore.txt create mode 100644 include/linux/percpu-rwsem.h (limited to 'include') diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt new file mode 100644 index 000000000000..eddd77094725 --- /dev/null +++ b/Documentation/percpu-rw-semaphore.txt @@ -0,0 +1,27 @@ +Percpu rw semaphores +-------------------- + +Percpu rw semaphores is a new read-write semaphore design that is +optimized for locking for reading. + +The problem with traditional read-write semaphores is that when multiple +cores take the lock for reading, the cache line containing the semaphore +is bouncing between L1 caches of the cores, causing performance +degradation. + +Locking for reading it very fast, it uses RCU and it avoids any atomic +instruction in the lock and unlock path. On the other hand, locking for +writing is very expensive, it calls synchronize_rcu() that can take +hundreds of microseconds. + +The lock is declared with "struct percpu_rw_semaphore" type. +The lock is initialized percpu_init_rwsem, it returns 0 on success and +-ENOMEM on allocation failure. +The lock must be freed with percpu_free_rwsem to avoid memory leak. + +The lock is locked for read with percpu_down_read, percpu_up_read and +for write with percpu_down_write, percpu_up_write. + +The idea of using RCU for optimized rw-lock was introduced by +Eric Dumazet . +The code was written by Mikulas Patocka diff --git a/fs/block_dev.c b/fs/block_dev.c index cdfb625824e2..7eeb0635338b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Prevent starting I/O or mapping the device */ - down_write(&bdev->bd_block_size_semaphore); + percpu_down_write(&bdev->bd_block_size_semaphore); /* Check that the block device is not memory mapped */ mapping = bdev->bd_inode->i_mapping; @@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size) if (!prio_tree_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_nonlinear)) { mutex_unlock(&mapping->i_mmap_mutex); - up_write(&bdev->bd_block_size_semaphore); + percpu_up_write(&bdev->bd_block_size_semaphore); return -EBUSY; } mutex_unlock(&mapping->i_mmap_mutex); @@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size) kill_bdev(bdev); } - up_write(&bdev->bd_block_size_semaphore); + percpu_up_write(&bdev->bd_block_size_semaphore); return 0; } @@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; + + if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) { + kmem_cache_free(bdev_cachep, ei); + return NULL; + } + return &ei->vfs_inode; } @@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); + percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore); + kmem_cache_free(bdev_cachep, bdi); } @@ -491,7 +499,6 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); - init_rwsem(&bdev->bd_block_size_semaphore); } static inline void __bd_forget(struct inode *inode) @@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, ssize_t ret; struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); - down_read(&bdev->bd_block_size_semaphore); + percpu_down_read(&bdev->bd_block_size_semaphore); ret = generic_file_aio_read(iocb, iov, nr_segs, pos); - up_read(&bdev->bd_block_size_semaphore); + percpu_up_read(&bdev->bd_block_size_semaphore); return ret; } @@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, blk_start_plug(&plug); - down_read(&bdev->bd_block_size_semaphore); + percpu_down_read(&bdev->bd_block_size_semaphore); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { @@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = err; } - up_read(&bdev->bd_block_size_semaphore); + percpu_up_read(&bdev->bd_block_size_semaphore); blk_finish_plug(&plug); @@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma) int ret; struct block_device *bdev = I_BDEV(file->f_mapping->host); - down_read(&bdev->bd_block_size_semaphore); + percpu_down_read(&bdev->bd_block_size_semaphore); ret = generic_file_mmap(file, vma); - up_read(&bdev->bd_block_size_semaphore); + percpu_up_read(&bdev->bd_block_size_semaphore); return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e60bbd0225d5..24e1229cdfe0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -726,7 +727,7 @@ struct block_device { /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; /* A semaphore that prevents I/O while block size is being changed */ - struct rw_semaphore bd_block_size_semaphore; + struct percpu_rw_semaphore bd_block_size_semaphore; }; /* diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h new file mode 100644 index 000000000000..cf80f7e5277f --- /dev/null +++ b/include/linux/percpu-rwsem.h @@ -0,0 +1,89 @@ +#ifndef _LINUX_PERCPU_RWSEM_H +#define _LINUX_PERCPU_RWSEM_H + +#include +#include +#include +#include + +struct percpu_rw_semaphore { + unsigned __percpu *counters; + bool locked; + struct mutex mtx; +}; + +static inline void percpu_down_read(struct percpu_rw_semaphore *p) +{ + rcu_read_lock(); + if (unlikely(p->locked)) { + rcu_read_unlock(); + mutex_lock(&p->mtx); + this_cpu_inc(*p->counters); + mutex_unlock(&p->mtx); + return; + } + this_cpu_inc(*p->counters); + rcu_read_unlock(); +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *p) +{ + /* + * On X86, write operation in this_cpu_dec serves as a memory unlock + * barrier (i.e. memory accesses may be moved before the write, but + * no memory accesses are moved past the write). + * On other architectures this may not be the case, so we need smp_mb() + * there. + */ +#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE)) + barrier(); +#else + smp_mb(); +#endif + this_cpu_dec(*p->counters); +} + +static inline unsigned __percpu_count(unsigned __percpu *counters) +{ + unsigned total = 0; + int cpu; + + for_each_possible_cpu(cpu) + total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu)); + + return total; +} + +static inline void percpu_down_write(struct percpu_rw_semaphore *p) +{ + mutex_lock(&p->mtx); + p->locked = true; + synchronize_rcu(); + while (__percpu_count(p->counters)) + msleep(1); + smp_rmb(); /* paired with smp_mb() in percpu_sem_up_read() */ +} + +static inline void percpu_up_write(struct percpu_rw_semaphore *p) +{ + p->locked = false; + mutex_unlock(&p->mtx); +} + +static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p) +{ + p->counters = alloc_percpu(unsigned); + if (unlikely(!p->counters)) + return -ENOMEM; + p->locked = false; + mutex_init(&p->mtx); + return 0; +} + +static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p) +{ + free_percpu(p->counters); + p->counters = NULL; /* catch use after free bugs */ +} + +#endif -- cgit v1.2.3 From c2b1ad800b66f62105a7fd250604d72e07202e66 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Sep 2012 09:35:03 +0200 Subject: fs: fix include/percpu-rwsem.h export error We get the following export error on the include file: usr/include/linux/fs.h:13: included file 'linux/percpu-rwsem.h' is not exported Move the include inside the __KERNEL__ section. Reported-by: Stephen Rothwell Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 24e1229cdfe0..cefa9645fc85 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,7 +10,6 @@ #include #include #include -#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -417,6 +416,7 @@ struct inodes_stat_t { #include #include #include +#include #include -- cgit v1.2.3 From 2e484610296b25f0a04b516bc144a00731d1d845 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Sep 2012 12:45:28 +0200 Subject: scatterlist: add sg_nents Useful helper to know the number of entries in scatterlist. Signed-off-by: Maxim Levitsky Cc: Alex Dubov Acked-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/scatterlist.h | 1 + lib/scatterlist.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 7b600da9a635..4bd6c06eb28e 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -201,6 +201,7 @@ static inline void *sg_virt(struct scatterlist *sg) return page_address(sg_page(sg)) + sg->offset; } +int sg_nents(struct scatterlist *sg); struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index fadae774a20c..1bf60efb5e02 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -38,6 +38,28 @@ struct scatterlist *sg_next(struct scatterlist *sg) } EXPORT_SYMBOL(sg_next); +/** + * sg_nents - return total count of entries in scatterlist + * @sg: The scatterlist + * + * Description: + * Allows to know how many entries are in sg, taking into acount + * chaining as well + * + **/ +int sg_nents(struct scatterlist *sg) +{ + int nents = 0; + while (sg) { + nents++; + sg = sg_next(sg); + } + + return nents; +} +EXPORT_SYMBOL(sg_nents); + + /** * sg_last - return the last scatterlist entry in a list * @sgl: First entry in the scatterlist -- cgit v1.2.3