diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/base/base.h | 2 | ||||
-rw-r--r-- | drivers/block/aoe/aoecmd.c | 12 | ||||
-rw-r--r-- | drivers/block/aoe/aoenet.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 13 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 210 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 4 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 93 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 9 | ||||
-rw-r--r-- | drivers/md/md.c | 162 | ||||
-rw-r--r-- | drivers/md/md.h | 66 | ||||
-rw-r--r-- | drivers/md/raid0.c | 42 | ||||
-rw-r--r-- | drivers/md/raid1.c | 51 | ||||
-rw-r--r-- | drivers/md/raid10.c | 85 | ||||
-rw-r--r-- | drivers/md/raid5-ppl.c | 3 | ||||
-rw-r--r-- | drivers/md/raid5.c | 218 | ||||
-rw-r--r-- | drivers/s390/block/dasd.c | 74 | ||||
-rw-r--r-- | drivers/s390/block/dasd_diag.c | 22 | ||||
-rw-r--r-- | drivers/s390/block/dasd_eckd.c | 29 | ||||
-rw-r--r-- | drivers/s390/block/dasd_fba.c | 33 | ||||
-rw-r--r-- | drivers/s390/block/dasd_genhd.c | 13 | ||||
-rw-r--r-- | drivers/s390/block/dasd_int.h | 6 |
22 files changed, 671 insertions, 488 deletions
diff --git a/drivers/base/base.h b/drivers/base/base.h index eb4c0ace9242..0738ccad08b2 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK -extern struct class block_class; +extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d7317425be51..cc9077b588d7 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -419,13 +419,16 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu rcu_read_lock(); for_each_netdev_rcu(&init_net, ifp) { dev_hold(ifp); - if (!is_aoe_netif(ifp)) - goto cont; + if (!is_aoe_netif(ifp)) { + dev_put(ifp); + continue; + } skb = new_skb(sizeof *h + sizeof *ch); if (skb == NULL) { printk(KERN_INFO "aoe: skb alloc failure\n"); - goto cont; + dev_put(ifp); + continue; } skb_put(skb, sizeof *h + sizeof *ch); skb->dev = ifp; @@ -440,9 +443,6 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu h->major = cpu_to_be16(aoemajor); h->minor = aoeminor; h->cmd = AOECMD_CFG; - -cont: - dev_put(ifp); } rcu_read_unlock(); } diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index c51ea95bc2ce..923a134fd766 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -63,6 +63,7 @@ tx(int id) __must_hold(&txlock) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); + dev_put(ifp); spin_lock_irq(&txlock); } return 0; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index cea1e537fd56..113b441d4d36 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2690,6 +2690,14 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig int id; int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; + struct queue_limits lim = { + /* + * Setting the max_hw_sectors to an odd value of 8kibyte here. + * This triggers a max_bio_size message upon first attach or + * connect. + */ + .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, + }; device = minor_to_device(minor); if (device) @@ -2708,7 +2716,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_no_disk; @@ -2729,9 +2737,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_write_cache(disk->queue, true, true); - /* Setting the max_hw_sectors to an odd value of 8kibyte here - This triggers a max_bio_size message upon first attach or connect */ - blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8); device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 43747a1aae43..fbd92803dc1d 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1189,9 +1189,31 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) return 0; } -static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) +static unsigned int drbd_max_peer_bio_size(struct drbd_device *device) { - q->limits.discard_granularity = granularity; + /* + * We may ignore peer limits if the peer is modern enough. From 8.3.8 + * onwards the peer can use multiple BIOs for a single peer_request. + */ + if (device->state.conn < C_WF_REPORT_PARAMS) + return device->peer_max_bio_size; + + if (first_peer_device(device)->connection->agreed_pro_version < 94) + return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + + /* + * Correct old drbd (up to 8.3.7) if it believes it can do more than + * 32KiB. + */ + if (first_peer_device(device)->connection->agreed_pro_version == 94) + return DRBD_MAX_SIZE_H80_PACKET; + + /* + * drbd 8.3.8 onwards, before 8.4.0 + */ + if (first_peer_device(device)->connection->agreed_pro_version < 100) + return DRBD_MAX_BIO_SIZE_P95; + return DRBD_MAX_BIO_SIZE; } static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) @@ -1204,149 +1226,119 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) return AL_EXTENT_SIZE >> 9; } -static void decide_on_discard_support(struct drbd_device *device, +static bool drbd_discard_supported(struct drbd_connection *connection, struct drbd_backing_dev *bdev) { - struct drbd_connection *connection = - first_peer_device(device)->connection; - struct request_queue *q = device->rq_queue; - unsigned int max_discard_sectors; - if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) - goto not_supported; + return false; if (connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); - goto not_supported; + return false; } - /* - * We don't care for the granularity, really. - * - * Stacking limits below should fix it for the local device. Whether or - * not it is a suitable granularity on the remote device is not our - * problem, really. If you care, you need to use devices with similar - * topology on all peers. - */ - blk_queue_discard_granularity(q, 512); - max_discard_sectors = drbd_max_discard_sectors(connection); - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - return; - -not_supported: - blk_queue_discard_granularity(q, 0); - blk_queue_max_discard_sectors(q, 0); + return true; } -static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) +/* This is the workaround for "bio would need to, but cannot, be split" */ +static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device) { - /* Fixup max_write_zeroes_sectors after blk_stack_limits(): - * if we can handle "zeroes" efficiently on the protocol, - * we want to do that, even if our backend does not announce - * max_write_zeroes_sectors itself. */ - struct drbd_connection *connection = first_peer_device(device)->connection; - /* If the peer announces WZEROES support, use it. Otherwise, rather - * send explicit zeroes than rely on some discard-zeroes-data magic. */ - if (connection->agreed_features & DRBD_FF_WZEROES) - q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; - else - q->limits.max_write_zeroes_sectors = 0; -} + unsigned int max_segments; -static void fixup_discard_support(struct drbd_device *device, struct request_queue *q) -{ - unsigned int max_discard = device->rq_queue->limits.max_discard_sectors; - unsigned int discard_granularity = - device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT; + rcu_read_lock(); + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); - if (discard_granularity > max_discard) { - blk_queue_discard_granularity(q, 0); - blk_queue_max_discard_sectors(q, 0); - } + if (!max_segments) + return BLK_MAX_SEGMENTS; + return max_segments; } -static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, - unsigned int max_bio_size, struct o_qlim *o) +void drbd_reconsider_queue_parameters(struct drbd_device *device, + struct drbd_backing_dev *bdev, struct o_qlim *o) { + struct drbd_connection *connection = + first_peer_device(device)->connection; struct request_queue * const q = device->rq_queue; - unsigned int max_hw_sectors = max_bio_size >> 9; - unsigned int max_segments = 0; + unsigned int now = queue_max_hw_sectors(q) << 9; + struct queue_limits lim; struct request_queue *b = NULL; - struct disk_conf *dc; + unsigned int new; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; - max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); - rcu_read_lock(); - dc = rcu_dereference(device->ldev->disk_conf); - max_segments = dc->max_bio_bvecs; - rcu_read_unlock(); - - blk_set_stacking_limits(&q->limits); + device->local_max_bio_size = + queue_max_hw_sectors(b) << SECTOR_SHIFT; } - blk_queue_max_hw_sectors(q, max_hw_sectors); - /* This is the workaround for "bio would need to, but cannot, be split" */ - blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); - blk_queue_segment_boundary(q, PAGE_SIZE-1); - decide_on_discard_support(device, bdev); - - if (b) { - blk_stack_limits(&q->limits, &b->limits, 0); - disk_update_readahead(device->vdisk); + /* + * We may later detach and re-attach on a disconnected Primary. Avoid + * decreasing the value in this case. + * + * We want to store what we know the peer DRBD can handle, not what the + * peer IO backend can handle. + */ + new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size, + max(drbd_max_peer_bio_size(device), device->peer_max_bio_size)); + if (new != now) { + if (device->state.role == R_PRIMARY && new < now) + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", + new, now); + drbd_info(device, "max BIO size = %u\n", new); } - fixup_write_zeroes(device, q); - fixup_discard_support(device, q); -} - -void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) -{ - unsigned int now, new, local, peer; - - now = queue_max_hw_sectors(device->rq_queue) << 9; - local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ - peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ + lim = queue_limits_start_update(q); if (bdev) { - local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; - device->local_max_bio_size = local; + blk_set_stacking_limits(&lim); + lim.max_segments = drbd_backing_dev_max_segments(device); + } else { + lim.max_segments = BLK_MAX_SEGMENTS; } - local = min(local, DRBD_MAX_BIO_SIZE); - /* We may ignore peer limits if the peer is modern enough. - Because new from 8.3.8 onwards the peer can use multiple - BIOs for a single peer_request */ - if (device->state.conn >= C_WF_REPORT_PARAMS) { - if (first_peer_device(device)->connection->agreed_pro_version < 94) - peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); - /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ - else if (first_peer_device(device)->connection->agreed_pro_version == 94) - peer = DRBD_MAX_SIZE_H80_PACKET; - else if (first_peer_device(device)->connection->agreed_pro_version < 100) - peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ - else - peer = DRBD_MAX_BIO_SIZE; + lim.max_hw_sectors = new >> SECTOR_SHIFT; + lim.seg_boundary_mask = PAGE_SIZE - 1; - /* We may later detach and re-attach on a disconnected Primary. - * Avoid this setting to jump back in that case. - * We want to store what we know the peer DRBD can handle, - * not what the peer IO backend can handle. */ - if (peer > device->peer_max_bio_size) - device->peer_max_bio_size = peer; + /* + * We don't care for the granularity, really. + * + * Stacking limits below should fix it for the local device. Whether or + * not it is a suitable granularity on the remote device is not our + * problem, really. If you care, you need to use devices with similar + * topology on all peers. + */ + if (drbd_discard_supported(connection, bdev)) { + lim.discard_granularity = 512; + lim.max_hw_discard_sectors = + drbd_max_discard_sectors(connection); + } else { + lim.discard_granularity = 0; + lim.max_hw_discard_sectors = 0; } - new = min(local, peer); - if (device->state.role == R_PRIMARY && new < now) - drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); + if (bdev) + blk_stack_limits(&lim, &b->limits, 0); - if (new != now) - drbd_info(device, "max BIO size = %u\n", new); + /* + * If we can handle "zeroes" efficiently on the protocol, we want to do + * that, even if our backend does not announce max_write_zeroes_sectors + * itself. + */ + if (connection->agreed_features & DRBD_FF_WZEROES) + lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; + else + lim.max_write_zeroes_sectors = 0; + + if ((lim.discard_granularity >> SECTOR_SHIFT) > + lim.max_hw_discard_sectors) { + lim.discard_granularity = 0; + lim.max_hw_discard_sectors = 0; + } - drbd_setup_queue_param(device, bdev, new, o); + if (queue_limits_commit_update(q, &lim)) + drbd_err(device, "setting new queue limits failed\n"); } /* Starts the worker thread */ diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index d8b55874cd59..aa9f86507719 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -732,12 +732,12 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); - disk_set_max_open_zones(vblk->disk, v); + lim->max_open_zones = v; dev_dbg(&vdev->dev, "max open zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, zoned.max_active_zones, &v); - disk_set_max_active_zones(vblk->disk, v); + lim->max_active_zones = v; dev_dbg(&vdev->dev, "max active zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d06a9649d302..f716c3265f5c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -913,6 +913,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, uint64_t n; int idx; + if (cached_bdev) { + d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT; + lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev)); + } if (!d->stripe_size) d->stripe_size = 1 << 31; else if (d->stripe_size < BCH_MIN_STRIPE_SZ) @@ -1418,9 +1422,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - dc->disk.stripe_size = q->limits.io_opt >> 9; - - if (dc->disk.stripe_size) + if (bdev_io_opt(dc->bdev)) dc->partial_stripes_expensive = q->limits.raid_partial_stripes_expensive; @@ -1430,9 +1432,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) if (ret) return ret; - blk_queue_io_opt(dc->disk.disk->queue, - max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); - atomic_set(&dc->io_errors, 0); dc->io_disable = false; dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index eb009d6bb03a..17e9af60bbf7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -213,6 +213,7 @@ struct raid_dev { #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 #define RT_FLAG_RS_GROW 8 +#define RT_FLAG_RS_FROZEN 9 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3240,11 +3241,12 @@ size_check: rs->md.ro = 1; rs->md.in_sync = 1; - /* Keep array frozen until resume. */ - set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); - /* Has to be held on running the array */ mddev_suspend_and_lock_nointr(&rs->md); + + /* Keep array frozen until resume. */ + md_frozen_sync_thread(&rs->md); + r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ if (r) { @@ -3339,7 +3341,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - md_handle_request(mddev, bio); + if (unlikely(!md_handle_request(mddev, bio))) + return DM_MAPIO_REQUEUE; return DM_MAPIO_SUBMITTED; } @@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; + int ret = 0; if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (!strcasecmp(argv[0], "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) || + test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags)) + return -EBUSY; - if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) + if (!strcasecmp(argv[0], "frozen")) { + ret = mddev_lock(mddev); + if (ret) + return ret; + + md_frozen_sync_thread(mddev); + mddev_unlock(mddev); + } else if (!strcasecmp(argv[0], "idle")) { + ret = mddev_lock(mddev); + if (ret) + return ret; + + md_idle_sync_thread(mddev); + mddev_unlock(mddev); + } + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (decipher_sync_action(mddev, mddev->recovery) != st_idle) return -EBUSY; else if (!strcasecmp(argv[0], "resync")) ; /* MD_RECOVERY_NEEDED set below */ @@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); } +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + /* + * From now on, disallow raid_message() to change sync_thread until + * resume, raid_postsuspend() is too late. + */ + set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); + + if (!reshape_interrupted(mddev)) + return; + + /* + * For raid456, if reshape is interrupted, IO across reshape position + * will never make progress, while caller will wait for IO to be done. + * Inform raid456 to handle those IO to prevent deadlock. + */ + if (mddev->pers && mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); +} + +static void raid_presuspend_undo(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); +} + static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { - /* Writes have to be stopped before suspending to avoid deadlocks. */ - if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) - md_stop_writes(&rs->md); - + /* + * sync_thread must be stopped during suspend, and writes have + * to be stopped before suspending to avoid deadlocks. + */ + md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); } } @@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - /* Be prepared for mddev_resume() in raid_resume() */ - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; @@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti) * Take this opportunity to check whether any failed * devices are reachable again. */ + mddev_lock_nointr(mddev); attempt_restore_of_faulty_devices(rs); + mddev_unlock(mddev); } if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { @@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti) if (mddev->delta_disks < 0) rs_set_capacity(rs); + WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); + WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); mddev_lock_nointr(mddev); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; + md_unfrozen_sync_thread(mddev); mddev_unlock_and_resume(mddev); } } @@ -4074,6 +4123,8 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .presuspend_undo = raid_presuspend_undo, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a4976ceae868..059afc24c08b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1046,9 +1046,8 @@ void md_bitmap_unplug(struct bitmap *bitmap) if (dirty || need_write) { if (!writing) { md_bitmap_wait_writes(bitmap); - if (bitmap->mddev->queue) - blk_add_trace_msg(bitmap->mddev->queue, - "md bitmap_unplug"); + mddev_add_trace_msg(bitmap->mddev, + "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); filemap_write_page(bitmap, i, false); @@ -1319,9 +1318,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) } bitmap->allclean = 1; - if (bitmap->mddev->queue) - blk_add_trace_msg(bitmap->mddev->queue, - "md bitmap_daemon_work"); + mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work"); /* Any file-page which is PENDING now needs to be written. * So set NEEDWRITE now, then after we make any last-minute changes diff --git a/drivers/md/md.c b/drivers/md/md.c index 48ae2b1cb57a..7d7b982e369c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -65,7 +65,6 @@ #include <linux/percpu-refcount.h> #include <linux/part_stat.h> -#include <trace/events/block.h> #include "md.h" #include "md-bitmap.h" #include "md-cluster.h" @@ -99,18 +98,6 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -378,7 +365,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; } -void md_handle_request(struct mddev *mddev, struct bio *bio) +bool md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: if (is_suspended(mddev, bio)) { @@ -386,7 +373,7 @@ check_suspended: /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { bio_wouldblock_error(bio); - return; + return true; } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, @@ -402,10 +389,13 @@ check_suspended: if (!mddev->pers->make_request(mddev, bio)) { percpu_ref_put(&mddev->active_io); + if (!mddev->gendisk && mddev->pers->prepare_suspend) + return false; goto check_suspended; } percpu_ref_put(&mddev->active_io); + return true; } EXPORT_SYMBOL(md_handle_request); @@ -2420,7 +2410,7 @@ int md_integrity_register(struct mddev *mddev) if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ @@ -2473,7 +2463,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_mddev; - if (!mddev->gendisk) + if (mddev_is_dm(mddev)) return 0; bi_mddev = blk_get_integrity(mddev->gendisk); @@ -2866,8 +2856,7 @@ repeat: pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); - if (mddev->queue) - blk_add_trace_msg(mddev->queue, "md md_update_sb"); + mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: md_bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { @@ -4175,7 +4164,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } - blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) @@ -4942,6 +4930,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) mddev_lock_nointr(mddev); } +void md_idle_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, true); +} +EXPORT_SYMBOL_GPL(md_idle_sync_thread); + +void md_frozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, false); +} +EXPORT_SYMBOL_GPL(md_frozen_sync_thread); + +void md_unfrozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); +} +EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); + static void idle_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); @@ -5733,6 +5750,51 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; +/* stack the limit for all rdevs into lim */ +void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, + mddev->gendisk->disk_name); + } +} +EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); + +/* apply the extra stacking limits from a new rdev into mddev */ +int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + struct queue_limits lim; + + if (mddev_is_dm(mddev)) + return 0; + + lim = queue_limits_start_update(mddev->gendisk->queue); + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, + mddev->gendisk->disk_name); + return queue_limits_commit_update(mddev->gendisk->queue, &lim); +} +EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); + +/* update the optimal I/O size after a reshape */ +void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) +{ + struct queue_limits lim; + + if (mddev_is_dm(mddev)) + return; + + /* don't bother updating io_opt if we can't suspend the array */ + if (mddev_suspend(mddev, false) < 0) + return; + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.io_opt = lim.io_min * nr_stripes; + queue_limits_commit_update(mddev->gendisk->queue, &lim); + mddev_resume(mddev); +} +EXPORT_SYMBOL_GPL(mddev_update_io_opt); + static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); @@ -5815,9 +5877,7 @@ struct mddev *md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; - mddev->queue = disk->queue; - blk_set_stacking_limits(&mddev->queue->limits); - blk_queue_write_cache(mddev->queue, true, true); + blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); @@ -5959,7 +6019,7 @@ int md_run(struct mddev *mddev) invalidate_bdev(rdev->bdev); if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { mddev->ro = MD_RDONLY; - if (mddev->gendisk) + if (!mddev_is_dm(mddev)) set_disk_ro(mddev->gendisk, 1); } @@ -6062,7 +6122,10 @@ int md_run(struct mddev *mddev) pr_warn("True protection against single-disk failure might be compromised.\n"); } - mddev->recovery = 0; + /* dm-raid expect sync_thread to be frozen until resume */ + if (mddev->gendisk) + mddev->recovery = 0; + /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; @@ -6118,7 +6181,8 @@ int md_run(struct mddev *mddev) } } - if (mddev->queue) { + if (!mddev_is_dm(mddev)) { + struct request_queue *q = mddev->gendisk->queue; bool nonrot = true; rdev_for_each(rdev, mddev) { @@ -6130,14 +6194,14 @@ int md_run(struct mddev *mddev) if (mddev->degraded) nonrot = false; if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); /* Set the NOWAIT flags if all underlying devices support it */ if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); } if (pers->sync_request) { if (mddev->kobj.sd && @@ -6344,7 +6408,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - stop_sync_thread(mddev, true, false); del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { @@ -6369,6 +6432,8 @@ static void __md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, false); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -6382,8 +6447,10 @@ static void mddev_detach(struct mddev *mddev) mddev->pers->quiesce(mddev, 0); } md_unregister_thread(mddev, &mddev->thread); - if (mddev->queue) - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + + /* the unplug fn references 'conf' */ + if (!mddev_is_dm(mddev)) + blk_sync_queue(mddev->gendisk->queue); } static void __md_stop(struct mddev *mddev) @@ -7101,7 +7168,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) if (!bdev_nowait(rdev->bdev)) { pr_info("%s: Disabling nowait because %pg does not support nowait\n", mdname(mddev), rdev->bdev); - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); } /* * Kick recovery, maybe this spare has to be added to the @@ -7338,10 +7405,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) if (!rv) { if (mddev_is_clustered(mddev)) md_cluster_ops->update_size(mddev, old_dev_sectors); - else if (mddev->queue) { + else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - } } return rv; } @@ -8662,10 +8728,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(discard_bio, - disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); + mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL_GPL(md_submit_discard_bio); @@ -8712,6 +8775,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio) } EXPORT_SYMBOL_GPL(md_account_bio); +void md_free_cloned_bio(struct bio *bio) +{ + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; + + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; + + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + + bio_put(bio); + percpu_ref_put(&mddev->active_io); +} +EXPORT_SYMBOL_GPL(md_free_cloned_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9141,7 +9221,7 @@ void md_do_sync(struct md_thread *thread) mddev->delta_disks > 0 && mddev->pers->finish_reshape && mddev->pers->size && - mddev->queue) { + !mddev_is_dm(mddev)) { mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index b2076a165c10..67e50c44f4b5 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -18,6 +18,7 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <trace/events/block.h> #include "md-cluster.h" #define MaxSector (~(sector_t)0) @@ -479,7 +480,6 @@ struct mddev { struct timer_list safemode_timer; struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ - struct request_queue *queue; /* for plugging ... */ struct bitmap *bitmap; /* the bitmap for the device */ struct { @@ -569,6 +569,37 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool reshape_interrupted(struct mddev *mddev) +{ + /* reshape never start */ + if (mddev->reshape_position == MaxSector) + return false; + + /* interrupted */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return true; + + /* running reshape will be interrupted soon. */ + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return true; + + return false; +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -628,6 +659,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -761,6 +793,7 @@ extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); +void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, @@ -789,9 +822,12 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); -extern void md_handle_request(struct mddev *mddev, struct bio *bio); +extern bool md_handle_request(struct mddev *mddev, struct bio *bio); extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); +extern void md_idle_sync_thread(struct mddev *mddev); +extern void md_frozen_sync_thread(struct mddev *mddev); +extern void md_unfrozen_sync_thread(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); @@ -832,7 +868,7 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) - mddev->queue->limits.max_write_zeroes_sectors = 0; + mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; } static inline int mddev_suspend_and_lock(struct mddev *mddev) @@ -871,7 +907,31 @@ void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); +void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); +int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); +void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); extern const struct block_device_operations md_fops; +/* + * MD devices can be used undeneath by DM, in which case ->gendisk is NULL. + */ +static inline bool mddev_is_dm(struct mddev *mddev) +{ + return !mddev->gendisk; +} + +static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, + sector_t sector) +{ + if (!mddev_is_dm(mddev)) + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); +} + +#define mddev_add_trace_msg(mddev, fmt, args...) \ +do { \ + if (!mddev_is_dm(mddev)) \ + blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ +} while (0) + #endif /* _MD_MD_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c50a7abda744..c5d4aeb68404 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -379,6 +379,19 @@ static void raid0_free(struct mddev *mddev, void *priv) free_conf(mddev, conf); } +static int raid0_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_hw_sectors = mddev->chunk_sectors; + lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * mddev->raid_disks; + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); +} + static int raid0_run(struct mddev *mddev) { struct r0conf *conf; @@ -399,20 +412,10 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; - if (mddev->queue) { - struct md_rdev *rdev; - - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); - - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - - rdev_for_each(rdev, mddev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } + if (!mddev_is_dm(mddev)) { + ret = raid0_set_limits(mddev); + if (ret) + goto out_free_conf; } /* calculate array device size */ @@ -426,8 +429,10 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - free_conf(mddev, conf); - + goto out_free_conf; + return 0; +out_free_conf: + free_conf(mddev, conf); return ret; } @@ -578,10 +583,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio) bio_set_dev(bio, tmp_dev->bdev); bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; - - if (mddev->gendisk) - trace_block_bio_remap(bio, disk_devt(mddev->gendisk), - bio_sector); + mddev_trace_remap(mddev, bio, bio_sector); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index afca975ec7f3..be8ac24f50b6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,9 +46,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); -#define raid1_log(md, fmt, args...) \ - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) - #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -1196,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra) */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; - raid1_log(conf->mddev, "wait freeze"); + mddev_add_trace_msg(conf->mddev, "raid1 wait freeze"); wait_event_lock_irq_cmd( conf->wait_barrier, get_unqueued_pending(conf) == extra, @@ -1385,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' */ - raid1_log(mddev, "wait behind writes"); + mddev_add_trace_msg(mddev, "raid1 wait behind writes"); wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); } @@ -1418,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_private = r1_bio; - - if (mddev->gendisk) - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); - + mddev_trace_remap(mddev, read_bio, r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1572,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bio_wouldblock_error(bio); return; } - raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); + mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", + blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf, bio->bi_iter.bi_sector, false); goto retry_write; @@ -1655,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); - - if (mddev->gendisk) - trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), - r1_bio->sector); + mddev_trace_remap(mddev, mbio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { @@ -1935,12 +1926,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; raid1_add_conf(conf, rdev, mirror, false); - err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ @@ -3204,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } +static int raid1_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_write_zeroes_sectors = 0; + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); +} + static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; int i; - struct md_rdev *rdev; int ret; if (mddev->level != 1) { @@ -3236,14 +3235,10 @@ static int raid1_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (mddev->queue) - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - - rdev_for_each(rdev, mddev) { - if (!mddev->gendisk) - continue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (!mddev_is_dm(mddev)) { + ret = raid1_set_limits(mddev); + if (ret) + goto abort; } mddev->degraded = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8aecdb1ccc16..b0fd3005f5c1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); static void end_reshape_write(struct bio *bio); static void end_reshape(struct r10conf *conf); -#define raid10_log(md, fmt, args...) \ - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) - #include "raid1-10.c" #define NULL_CMD @@ -1019,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait) ret = false; } else { conf->nr_waiting++; - raid10_log(conf->mddev, "wait barrier"); + mddev_add_trace_msg(conf->mddev, "raid10 wait barrier"); wait_event_barrier(conf, stop_waiting_barrier(conf)); conf->nr_waiting--; } @@ -1138,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, bio_wouldblock_error(bio); return false; } - raid10_log(conf->mddev, "wait reshape"); + mddev_add_trace_msg(conf->mddev, "raid10 wait reshape"); wait_event(conf->wait_barrier, conf->reshape_progress <= bio->bi_iter.bi_sector || conf->reshape_progress >= bio->bi_iter.bi_sector + @@ -1235,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, test_bit(R10BIO_FailFast, &r10_bio->state)) read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_private = r10_bio; - - if (mddev->gendisk) - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), - r10_bio->sector); + mddev_trace_remap(mddev, read_bio, r10_bio->sector); submit_bio_noacct(read_bio); return; } @@ -1274,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, && enough(conf, devnum)) mbio->bi_opf |= MD_FAILFAST; mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); + mddev_trace_remap(mddev, mbio, r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; @@ -1342,8 +1333,9 @@ retry_wait: if (unlikely(blocked_rdev)) { /* Have to wait for this device to get unblocked, then retry */ allow_barrier(conf); - raid10_log(conf->mddev, "%s wait rdev %d blocked", - __func__, blocked_rdev->raid_disk); + mddev_add_trace_msg(conf->mddev, + "raid10 %s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf, false); goto retry_wait; @@ -1398,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, bio_wouldblock_error(bio); return; } - raid10_log(conf->mddev, "wait reshape metadata"); + mddev_add_trace_msg(conf->mddev, + "raid10 wait reshape metadata"); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); @@ -2113,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) continue; } - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; rdev->raid_disk = mirror; @@ -2132,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); rdev->raid_disk = repl_slot; - err = 0; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; conf->fullsync = 1; WRITE_ONCE(p->replacement, rdev); } @@ -3976,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static void raid10_set_io_opt(struct r10conf *conf) +static unsigned int raid10_nr_stripes(struct r10conf *conf) { - int raid_disks = conf->geo.raid_disks; + unsigned int raid_disks = conf->geo.raid_disks; - if (!(conf->geo.raid_disks % conf->geo.near_copies)) - raid_disks /= conf->geo.near_copies; - blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * - raid_disks); + if (conf->geo.raid_disks % conf->geo.near_copies) + return raid_disks; + return raid_disks / conf->geo.near_copies; +} + +static int raid10_set_queue_limits(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_write_zeroes_sectors = 0; + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * raid10_nr_stripes(conf); + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); } static int raid10_run(struct mddev *mddev) @@ -3995,6 +3998,7 @@ static int raid10_run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + int ret = -EIO; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -4021,12 +4025,6 @@ static int raid10_run(struct mddev *mddev) } } - if (mddev->queue) { - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - raid10_set_io_opt(conf); - } - rdev_for_each(rdev, mddev) { long long diff; @@ -4055,14 +4053,16 @@ static int raid10_run(struct mddev *mddev) if (first || diff < min_offset_diff) min_offset_diff = diff; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - disk->head_position = 0; first = 0; } + if (!mddev_is_dm(conf->mddev)) { + ret = raid10_set_queue_limits(mddev); + if (ret) + goto out_free_conf; + } + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { pr_err("md/raid10:%s: not enough operational mirrors.\n", @@ -4163,7 +4163,7 @@ out_free_conf: raid10_free_conf(conf); mddev->private = NULL; out: - return -EIO; + return ret; } static void raid10_free(struct mddev *mddev, void *priv) @@ -4940,8 +4940,7 @@ static void end_reshape(struct r10conf *conf) conf->reshape_safe = MaxSector; spin_unlock_irq(&conf->device_lock); - if (conf->mddev->queue) - raid10_set_io_opt(conf); + mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf)); conf->fullsync = 0; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index da4ba736c4f0..a70cbec12ed0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1393,7 +1393,8 @@ int ppl_init_log(struct r5conf *conf) ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); ppl_conf->block_size = 512; } else { - ppl_conf->block_size = queue_logical_block_size(mddev->queue); + ppl_conf->block_size = + queue_logical_block_size(mddev->gendisk->queue); } for (i = 0; i < ppl_conf->count; i++) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 48129de21aec..f2e3c3e2d879 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,6 +36,7 @@ */ #include <linux/blkdev.h> +#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -760,6 +761,7 @@ enum stripe_result { STRIPE_RETRY, STRIPE_SCHEDULE_AND_RETRY, STRIPE_FAIL, + STRIPE_WAIT_RESHAPE, }; struct stripe_request_ctx { @@ -1293,10 +1295,7 @@ again: if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); - if (conf->mddev->gendisk) - trace_block_bio_remap(bi, - disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else @@ -1340,10 +1339,7 @@ again: */ if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; - if (conf->mddev->gendisk) - trace_block_bio_remap(rbi, - disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else @@ -2420,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num) size_t namelen = sizeof(conf->cache_name[0]); int devs = max(conf->raid_disks, conf->previous_raid_disks); - if (conf->mddev->gendisk) + if (mddev_is_dm(conf->mddev)) snprintf(conf->cache_name[0], namelen, - "raid%d-%s", conf->level, mdname(conf->mddev)); + "raid%d-%p", conf->level, conf->mddev); else snprintf(conf->cache_name[0], namelen, - "raid%d-%p", conf->level, conf->mddev); + "raid%d-%s", conf->level, mdname(conf->mddev)); snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); conf->active_name = 0; @@ -4199,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - if (conf->mddev->queue) - blk_add_trace_msg(conf->mddev->queue, - "raid5 rmw %llu %d", - (unsigned long long)sh->sector, rmw); + mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + sh->sector, rmw); + for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_InJournal, &dev->flags) && @@ -4279,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && conf->mddev->queue) - blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", - (unsigned long long)sh->sector, - rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); + if (rcw && !mddev_is_dm(conf->mddev)) + blk_add_trace_msg(conf->mddev->gendisk->queue, + "raid5 rcw %llu %d %d %d", + (unsigned long long)sh->sector, rcw, qread, + test_bit(STRIPE_DELAYED, &sh->state)); } if (rcw > disks && rmw > disks && @@ -5521,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); } - if (mddev->gendisk) - trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), - raid_bio->bi_iter.bi_sector); + mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bio); return 1; } @@ -5692,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) } release_inactive_stripe_list(conf, cb->temp_inactive_list, NR_STRIPE_HASH_LOCKS); - if (mddev->queue) - trace_block_unplug(mddev->queue, cnt, !from_schedule); + if (!mddev_is_dm(mddev)) + trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule); kfree(cb); } @@ -5937,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6016,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) { + bi->bi_status = BLK_STS_RESOURCE; + ret = STRIPE_WAIT_RESHAPE; + pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress"); + } return ret; } @@ -6137,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); - if (res == STRIPE_FAIL) + if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) break; if (res == STRIPE_RETRY) @@ -6175,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) md_write_end(mddev); + if (res == STRIPE_WAIT_RESHAPE) { + md_free_cloned_bio(bi); + return false; + } + bio_endio(bi); return true; } @@ -6764,7 +6770,18 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -7073,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) if (!conf) err = -ENODEV; else if (new != conf->skip_copy) { - struct request_queue *q = mddev->queue; + struct request_queue *q = mddev->gendisk->queue; conf->skip_copy = new; if (new) @@ -7675,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 0; } -static void raid5_set_io_opt(struct r5conf *conf) +static int raid5_set_limits(struct mddev *mddev) { - blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * - (conf->raid_disks - conf->max_degraded)); + struct r5conf *conf = mddev->private; + struct queue_limits lim; + int data_disks, stripe; + struct md_rdev *rdev; + + /* + * The read-ahead size must cover two whole stripes, which is + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices. + */ + data_disks = conf->previous_raid_disks - conf->max_degraded; + + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); + + blk_set_stacking_limits(&lim); + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); + lim.raid_partial_stripes_expensive = 1; + lim.discard_granularity = stripe; + lim.max_write_zeroes_sectors = 0; + mddev_stack_rdev_limits(mddev, &lim); + rdev_for_each(rdev, mddev) + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, + mddev->gendisk->disk_name); + + /* + * Zeroing is required for discard, otherwise data could be lost. + * + * Consider a scenario: discard a stripe (the stripe could be + * inconsistent if discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again depending on which + * disks are used to calculate parity); the disk is broken; The stripe + * data of this disk is lost. + * + * We only allow DISCARD if the sysadmin has confirmed that only safe + * devices are in use by setting a module parameter. A better idea + * might be to turn DISCARD into WRITE_ZEROES requests, as that is + * required to be safe. + */ + if (!devices_handle_discard_safely || + lim.max_discard_sectors < (stripe >> 9) || + lim.discard_granularity < stripe) + lim.max_hw_discard_sectors = 0; + + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf); + + /* No restrictions on the number of segments in the request */ + lim.max_segments = USHRT_MAX; + + return queue_limits_set(mddev->gendisk->queue, &lim); } static int raid5_run(struct mddev *mddev) @@ -7691,6 +7763,7 @@ static int raid5_run(struct mddev *mddev) int i; long long min_offset_diff = 0; int first = 1; + int ret = -EIO; if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", @@ -7943,66 +8016,10 @@ static int raid5_run(struct mddev *mddev) mdname(mddev)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - if (mddev->queue) { - int chunk_size; - /* read-ahead size must cover two whole stripes, which - * is 2 * (datadisks) * chunksize where 'n' is the - * number of raid devices - */ - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - - chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - raid5_set_io_opt(conf); - mddev->queue->limits.raid_partial_stripes_expensive = 1; - /* - * We can only discard a whole stripe. It doesn't make sense to - * discard data disk but write parity disk - */ - stripe = stripe * PAGE_SIZE; - stripe = roundup_pow_of_two(stripe); - mddev->queue->limits.discard_granularity = stripe; - - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - - rdev_for_each(rdev, mddev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->new_data_offset << 9); - } - - /* - * zeroing is required, otherwise data - * could be lost. Consider a scenario: discard a stripe - * (the stripe could be inconsistent if - * discard_zeroes_data is 0); write one disk of the - * stripe (the stripe could be inconsistent again - * depending on which disks are used to calculate - * parity); the disk is broken; The stripe data of this - * disk is lost. - * - * We only allow DISCARD if the sysadmin has confirmed that - * only safe devices are in use by setting a module parameter. - * A better idea might be to turn DISCARD into WRITE_ZEROES - * requests, as that is required to be safe. - */ - if (!devices_handle_discard_safely || - mddev->queue->limits.max_discard_sectors < (stripe >> 9) || - mddev->queue->limits.discard_granularity < stripe) - blk_queue_max_discard_sectors(mddev->queue, 0); - - /* - * Requests require having a bitmap for each stripe. - * Limit the max sectors based on this. - */ - blk_queue_max_hw_sectors(mddev->queue, - RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); - - /* No restrictions on the number of segments in the request */ - blk_queue_max_segments(mddev->queue, USHRT_MAX); + if (!mddev_is_dm(mddev)) { + ret = raid5_set_limits(mddev); + if (ret) + goto abort; } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) @@ -8015,7 +8032,7 @@ abort: free_conf(conf); mddev->private = NULL; pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); - return -EIO; + return ret; } static void raid5_free(struct mddev *mddev, void *priv) @@ -8547,8 +8564,8 @@ static void end_reshape(struct r5conf *conf) spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - if (conf->mddev->queue) - raid5_set_io_opt(conf); + mddev_update_io_opt(conf->mddev, + conf->raid_disks - conf->max_degraded); } } @@ -8925,6 +8942,18 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +/* + * This is only used for dm-raid456, caller already frozen sync_thread, hence + * if rehsape is still in progress, io that is waiting for reshape can never be + * done now, hence wake up and handle those IO. + */ +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8948,6 +8977,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid5_personality = { @@ -8972,6 +9002,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid4_personality = @@ -8997,6 +9028,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static int __init raid5_init(void) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 2f3adf5d8fee..e8eb710bd25d 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -307,39 +307,57 @@ static int dasd_state_basic_to_known(struct dasd_device *device) */ static int dasd_state_basic_to_ready(struct dasd_device *device) { - int rc; - struct dasd_block *block; - struct gendisk *disk; + struct dasd_block *block = device->block; + struct queue_limits lim; + int rc = 0; - rc = 0; - block = device->block; /* make disk known with correct capacity */ - if (block) { - if (block->base->discipline->do_analysis != NULL) - rc = block->base->discipline->do_analysis(block); - if (rc) { - if (rc != -EAGAIN) { - device->state = DASD_STATE_UNFMT; - disk = device->block->gdp; - kobject_uevent(&disk_to_dev(disk)->kobj, - KOBJ_CHANGE); - goto out; - } - return rc; - } - if (device->discipline->setup_blk_queue) - device->discipline->setup_blk_queue(block); - set_capacity(block->gdp, - block->blocks << block->s2b_shift); + if (!block) { device->state = DASD_STATE_READY; - rc = dasd_scan_partitions(block); - if (rc) { - device->state = DASD_STATE_BASIC; + goto out; + } + + if (block->base->discipline->do_analysis != NULL) + rc = block->base->discipline->do_analysis(block); + if (rc) { + if (rc == -EAGAIN) return rc; - } - } else { - device->state = DASD_STATE_READY; + device->state = DASD_STATE_UNFMT; + kobject_uevent(&disk_to_dev(device->block->gdp)->kobj, + KOBJ_CHANGE); + goto out; } + + lim = queue_limits_start_update(block->gdp->queue); + lim.max_dev_sectors = device->discipline->max_sectors(block); + lim.max_hw_sectors = lim.max_dev_sectors; + lim.logical_block_size = block->bp_block; + + if (device->discipline->has_discard) { + unsigned int max_bytes; + + lim.discard_granularity = block->bp_block; + + /* Calculate max_discard_sectors and make it PAGE aligned */ + max_bytes = USHRT_MAX * block->bp_block; + max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE); + + lim.max_hw_discard_sectors = max_bytes / block->bp_block; + lim.max_write_zeroes_sectors = lim.max_hw_discard_sectors; + } + rc = queue_limits_commit_update(block->gdp->queue, &lim); + if (rc) + return rc; + + set_capacity(block->gdp, block->blocks << block->s2b_shift); + device->state = DASD_STATE_READY; + + rc = dasd_scan_partitions(block); + if (rc) { + device->state = DASD_STATE_BASIC; + return rc; + } + out: if (device->discipline->basic_to_ready) rc = device->discipline->basic_to_ready(device); diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 041088c7e909..ea4b1d01bb76 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -617,25 +617,9 @@ dasd_diag_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, "dump sense not available for DIAG data"); } -/* - * Initialize block layer request queue. - */ -static void dasd_diag_setup_blk_queue(struct dasd_block *block) +static unsigned int dasd_diag_max_sectors(struct dasd_block *block) { - unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->gdp->queue; - int max; - - max = DIAG_MAX_BLOCKS << block->s2b_shift; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, logical_block_size); - blk_queue_max_hw_sectors(q, max); - blk_queue_max_segments(q, USHRT_MAX); - /* With page sized segments each segment can be translated into one idaw/tidaw */ - blk_queue_max_segment_size(q, PAGE_SIZE); - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - blk_queue_dma_alignment(q, PAGE_SIZE - 1); + return DIAG_MAX_BLOCKS << block->s2b_shift; } static int dasd_diag_pe_handler(struct dasd_device *device, @@ -648,10 +632,10 @@ static struct dasd_discipline dasd_diag_discipline = { .owner = THIS_MODULE, .name = "DIAG", .ebcname = "DIAG", + .max_sectors = dasd_diag_max_sectors, .check_device = dasd_diag_check_device, .pe_handler = dasd_diag_pe_handler, .fill_geometry = dasd_diag_fill_geometry, - .setup_blk_queue = dasd_diag_setup_blk_queue, .start_IO = dasd_start_diag, .term_IO = dasd_diag_term_IO, .handle_terminated_request = dasd_diag_handle_terminated_request, diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 8aade17d885c..373c1a86c33e 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -6826,17 +6826,9 @@ static void dasd_eckd_handle_hpf_error(struct dasd_device *device, dasd_schedule_requeue(device); } -/* - * Initialize block layer request queue. - */ -static void dasd_eckd_setup_blk_queue(struct dasd_block *block) +static unsigned int dasd_eckd_max_sectors(struct dasd_block *block) { - unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->gdp->queue; - struct dasd_device *device = block->base; - int max; - - if (device->features & DASD_FEATURE_USERAW) { + if (block->base->features & DASD_FEATURE_USERAW) { /* * the max_blocks value for raw_track access is 256 * it is higher than the native ECKD value because we @@ -6844,19 +6836,10 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block) * so the max_hw_sectors are * 2048 x 512B = 1024kB = 16 tracks */ - max = DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift; - } else { - max = DASD_ECKD_MAX_BLOCKS << block->s2b_shift; + return DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift; } - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, logical_block_size); - blk_queue_max_hw_sectors(q, max); - blk_queue_max_segments(q, USHRT_MAX); - /* With page sized segments each segment can be translated into one idaw/tidaw */ - blk_queue_max_segment_size(q, PAGE_SIZE); - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - blk_queue_dma_alignment(q, PAGE_SIZE - 1); + + return DASD_ECKD_MAX_BLOCKS << block->s2b_shift; } static struct ccw_driver dasd_eckd_driver = { @@ -6888,7 +6871,7 @@ static struct dasd_discipline dasd_eckd_discipline = { .basic_to_ready = dasd_eckd_basic_to_ready, .online_to_ready = dasd_eckd_online_to_ready, .basic_to_known = dasd_eckd_basic_to_known, - .setup_blk_queue = dasd_eckd_setup_blk_queue, + .max_sectors = dasd_eckd_max_sectors, .fill_geometry = dasd_eckd_fill_geometry, .start_IO = dasd_start_IO, .term_IO = dasd_term_IO, diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 045e548630df..bcbb2f8e91fe 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -748,35 +748,9 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, free_page((unsigned long) page); } -/* - * Initialize block layer request queue. - */ -static void dasd_fba_setup_blk_queue(struct dasd_block *block) +static unsigned int dasd_fba_max_sectors(struct dasd_block *block) { - unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->gdp->queue; - unsigned int max_bytes, max_discard_sectors; - int max; - - max = DASD_FBA_MAX_BLOCKS << block->s2b_shift; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, logical_block_size); - blk_queue_max_hw_sectors(q, max); - blk_queue_max_segments(q, USHRT_MAX); - /* With page sized segments each segment can be translated into one idaw/tidaw */ - blk_queue_max_segment_size(q, PAGE_SIZE); - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - - q->limits.discard_granularity = logical_block_size; - - /* Calculate max_discard_sectors and make it PAGE aligned */ - max_bytes = USHRT_MAX * logical_block_size; - max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE); - max_discard_sectors = max_bytes / logical_block_size; - - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); + return DASD_FBA_MAX_BLOCKS << block->s2b_shift; } static int dasd_fba_pe_handler(struct dasd_device *device, @@ -789,10 +763,11 @@ static struct dasd_discipline dasd_fba_discipline = { .owner = THIS_MODULE, .name = "FBA ", .ebcname = "FBA ", + .has_discard = true, .check_device = dasd_fba_check_characteristics, .do_analysis = dasd_fba_do_analysis, .pe_handler = dasd_fba_pe_handler, - .setup_blk_queue = dasd_fba_setup_blk_queue, + .max_sectors = dasd_fba_max_sectors, .fill_geometry = dasd_fba_fill_geometry, .start_IO = dasd_start_IO, .term_IO = dasd_term_IO, diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 0465b706745f..528e2d38d9bf 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -34,6 +34,16 @@ MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD d */ int dasd_gendisk_alloc(struct dasd_block *block) { + struct queue_limits lim = { + /* + * With page sized segments, each segment can be translated into + * one idaw/tidaw. + */ + .max_segment_size = PAGE_SIZE, + .seg_boundary_mask = PAGE_SIZE - 1, + .dma_alignment = PAGE_SIZE - 1, + .max_segments = USHRT_MAX, + }; struct gendisk *gdp; struct dasd_device *base; int len, rc; @@ -53,11 +63,12 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (rc) return rc; - gdp = blk_mq_alloc_disk(&block->tag_set, NULL, block); + gdp = blk_mq_alloc_disk(&block->tag_set, &lim, block); if (IS_ERR(gdp)) { blk_mq_free_tag_set(&block->tag_set); return PTR_ERR(gdp); } + blk_queue_flag_set(QUEUE_FLAG_NONROT, gdp->queue); /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index a6c5f1fa2d87..b56d683a991d 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -293,6 +293,7 @@ struct dasd_discipline { struct module *owner; char ebcname[8]; /* a name used for tagging and printks */ char name[8]; /* a name used for tagging and printks */ + bool has_discard; struct list_head list; /* used for list of disciplines */ @@ -331,10 +332,7 @@ struct dasd_discipline { int (*online_to_ready) (struct dasd_device *); int (*basic_to_known)(struct dasd_device *); - /* - * Initialize block layer request queue. - */ - void (*setup_blk_queue)(struct dasd_block *); + unsigned int (*max_sectors)(struct dasd_block *); /* (struct dasd_device *); * Device operation functions. build_cp creates a ccw chain for * a block device request, start_io starts the request and |