summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/xip.txt15
-rw-r--r--Documentation/nommu-mmap.txt8
-rw-r--r--MAINTAINERS4
-rw-r--r--arch/powerpc/sysdev/axonram.c19
-rw-r--r--arch/x86/Makefile1
-rw-r--r--block/bio.c426
-rw-r--r--block/blk-core.c13
-rw-r--r--block/blk-lib.c30
-rw-r--r--block/blk-map.c172
-rw-r--r--block/blk-merge.c41
-rw-r--r--block/blk-mq-tag.c81
-rw-r--r--block/blk-mq-tag.h4
-rw-r--r--block/blk-mq.c13
-rw-r--r--block/blk-tag.c33
-rw-r--r--block/bsg.c72
-rw-r--r--block/cfq-iosched.c16
-rw-r--r--block/ioctl.c2
-rw-r--r--block/partitions/check.c12
-rw-r--r--block/scsi_ioctl.c17
-rw-r--r--drivers/ata/libata-core.c70
-rw-r--r--drivers/ata/libata-scsi.c33
-rw-r--r--drivers/ata/libata.h4
-rw-r--r--drivers/ata/sata_sil24.c1
-rw-r--r--drivers/block/brd.c123
-rw-r--r--drivers/block/drbd/drbd_receiver.c2
-rw-r--r--drivers/block/floppy.c17
-rw-r--r--drivers/block/loop.c416
-rw-r--r--drivers/block/loop.h18
-rw-r--r--drivers/block/null_blk.c2
-rw-r--r--drivers/block/nvme-core.c128
-rw-r--r--drivers/block/osdblk.c2
-rw-r--r--drivers/block/xen-blkback/common.h9
-rw-r--r--drivers/block/xen-blkback/xenbus.c4
-rw-r--r--drivers/block/xen-blkfront.c4
-rw-r--r--drivers/char/mem.c64
-rw-r--r--drivers/char/raw.c4
-rw-r--r--drivers/md/Kconfig5
-rw-r--r--drivers/md/bitmap.c15
-rw-r--r--drivers/md/dm-bufio.c3
-rw-r--r--drivers/md/dm-cache-target.c5
-rw-r--r--drivers/md/dm-ioctl.c4
-rw-r--r--drivers/md/dm-log-userspace-base.c5
-rw-r--r--drivers/md/dm-mpath.c87
-rw-r--r--drivers/md/dm-raid.c24
-rw-r--r--drivers/md/dm-snap-persistent.c14
-rw-r--r--drivers/md/dm-table.c72
-rw-r--r--drivers/md/dm-target.c15
-rw-r--r--drivers/md/dm-thin-metadata.c9
-rw-r--r--drivers/md/dm-thin-metadata.h2
-rw-r--r--drivers/md/dm-thin.c5
-rw-r--r--drivers/md/dm.c346
-rw-r--r--drivers/md/dm.h11
-rw-r--r--drivers/md/faulty.c8
-rw-r--r--drivers/md/linear.c67
-rw-r--r--drivers/md/md.c816
-rw-r--r--drivers/md/md.h57
-rw-r--r--drivers/md/multipath.c22
-rw-r--r--drivers/md/raid0.c29
-rw-r--r--drivers/md/raid1.c52
-rw-r--r--drivers/md/raid1.h3
-rw-r--r--drivers/md/raid10.c49
-rw-r--r--drivers/md/raid10.h3
-rw-r--r--drivers/md/raid5.c334
-rw-r--r--drivers/md/raid5.h1
-rw-r--r--drivers/mtd/mtdchar.c72
-rw-r--r--drivers/mtd/mtdconcat.c10
-rw-r--r--drivers/mtd/mtdcore.c81
-rw-r--r--drivers/mtd/mtdpart.c1
-rw-r--r--drivers/s390/block/dcssblk.c21
-rw-r--r--drivers/scsi/scsi_lib.c2
-rw-r--r--drivers/scsi/scsi_scan.c3
-rw-r--r--drivers/scsi/sg.c15
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_lib.c6
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/afs/volume.c2
-rw-r--r--fs/aio.c13
-rw-r--r--fs/block_dev.c77
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/inode.c6
-rw-r--r--fs/ceph/file.c2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/super.c20
-rw-r--r--fs/char_dev.c24
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/configfs/configfs_internal.h2
-rw-r--r--fs/configfs/inode.c17
-rw-r--r--fs/configfs/mount.c11
-rw-r--r--fs/ecryptfs/inode.c1
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/exofs/inode.c2
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/xip.c31
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/fs-writeback.c14
-rw-r--r--fs/fuse/file.c10
-rw-r--r--fs/fuse/inode.c1
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/glock.c1
-rw-r--r--fs/gfs2/ops_fstype.c1
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hugetlbfs/inode.c13
-rw-r--r--fs/inode.c13
-rw-r--r--fs/kernfs/inode.c13
-rw-r--r--fs/kernfs/kernfs-internal.h1
-rw-r--r--fs/kernfs/mount.c1
-rw-r--r--fs/ncpfs/inode.c3
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/inode.c1
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4super.c1
-rw-r--r--fs/nfs/super.c24
-rw-r--r--fs/nfs/write.c6
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/page.c4
-rw-r--r--fs/nilfs2/page.h3
-rw-r--r--fs/nilfs2/super.c6
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c14
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ramfs/file-nommu.c7
-rw-r--r--fs/ramfs/inode.c21
-rw-r--r--fs/romfs/mmap-nommu.c10
-rw-r--r--fs/romfs/super.c3
-rw-r--r--fs/super.c12
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/super.c5
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--include/linux/backing-dev.h53
-rw-r--r--include/linux/bio.h12
-rw-r--r--include/linux/blk-mq.h10
-rw-r--r--include/linux/blkdev.h25
-rw-r--r--include/linux/cdev.h2
-rw-r--r--include/linux/device-mapper.h10
-rw-r--r--include/linux/fs.h28
-rw-r--r--include/linux/libata.h5
-rw-r--r--include/linux/mtd/mtd.h2
-rw-r--r--include/linux/nvme.h3
-rw-r--r--include/linux/wait.h15
-rw-r--r--include/scsi/scsi_host.h3
-rw-r--r--include/scsi/scsi_tcq.h3
-rw-r--r--include/trace/events/writeback.h12
-rw-r--r--include/uapi/linux/dm-ioctl.h4
-rw-r--r--lib/raid6/algos.c2
-rw-r--r--lib/raid6/recov_avx2.c2
-rw-r--r--lib/raid6/recov_ssse3.c6
-rw-r--r--mm/backing-dev.c107
-rw-r--r--mm/fadvise.c4
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/filemap_xip.c3
-rw-r--r--mm/madvise.c17
-rw-r--r--mm/nommu.c69
-rw-r--r--mm/page-writeback.c29
-rw-r--r--mm/readahead.c4
-rw-r--r--mm/shmem.c24
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swap_state.c6
-rw-r--r--mm/truncate.c2
-rw-r--r--mm/vmscan.c4
-rw-r--r--security/security.c13
165 files changed, 2562 insertions, 2524 deletions
diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt
index 0466ee569278..b77472949ede 100644
--- a/Documentation/filesystems/xip.txt
+++ b/Documentation/filesystems/xip.txt
@@ -28,12 +28,15 @@ Implementation
Execute-in-place is implemented in three steps: block device operation,
address space operation, and file operations.
-A block device operation named direct_access is used to retrieve a
-reference (pointer) to a block on-disk. The reference is supposed to be
-cpu-addressable, physical address and remain valid until the release operation
-is performed. A struct block_device reference is used to address the device,
-and a sector_t argument is used to identify the individual block. As an
-alternative, memory technology devices can be used for this.
+A block device operation named direct_access is used to translate the
+block device sector number to a page frame number (pfn) that identifies
+the physical page for the memory. It also returns a kernel virtual
+address that can be used to access the memory.
+
+The direct_access method takes a 'size' parameter that indicates the
+number of bytes being requested. The function should return the number
+of bytes that can be contiguously accessed at that offset. It may also
+return a negative errno if an error occurs.
The block device operation is optional, these block devices support it as of
today:
diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index 8e1ddec2c78a..ae57b9ea0d41 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -43,12 +43,12 @@ and it's also much more restricted in the latter case:
even if this was created by another process.
- If possible, the file mapping will be directly on the backing device
- if the backing device has the BDI_CAP_MAP_DIRECT capability and
+ if the backing device has the NOMMU_MAP_DIRECT capability and
appropriate mapping protection capabilities. Ramfs, romfs, cramfs
and mtd might all permit this.
- If the backing device device can't or won't permit direct sharing,
- but does have the BDI_CAP_MAP_COPY capability, then a copy of the
+ but does have the NOMMU_MAP_COPY capability, then a copy of the
appropriate bit of the file will be read into a contiguous bit of
memory and any extraneous space beyond the EOF will be cleared
@@ -220,7 +220,7 @@ directly (can't be copied).
The file->f_op->mmap() operation will be called to actually inaugurate the
mapping. It can be rejected at that point. Returning the ENOSYS error will
-cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified.
+cause the mapping to be copied instead if NOMMU_MAP_COPY is specified.
The vm_ops->close() routine will be invoked when the last mapping on a chardev
is removed. An existing mapping will be shared, partially or not, if possible
@@ -232,7 +232,7 @@ want to handle it, despite the fact it's got an operation. For instance, it
might try directing the call to a secondary driver which turns out not to
implement it. Such is the case for the framebuffer driver which attempts to
direct the call to the device-specific driver. Under such circumstances, the
-mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a
+mapping request will be rejected if NOMMU_MAP_COPY is not specified, and a
copy mapped otherwise.
IMPORTANT NOTE:
diff --git a/MAINTAINERS b/MAINTAINERS
index 22999654195a..482bfc55397d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6642,9 +6642,10 @@ F: include/uapi/linux/netrom.h
F: net/netrom/
NETWORK BLOCK DEVICE (NBD)
-M: Paul Clements <Paul.Clements@steeleye.com>
+M: Markus Pargmann <mpa@pengutronix.de>
S: Maintained
L: nbd-general@lists.sourceforge.net
+T: git git://git.pengutronix.de/git/mpa/linux-nbd.git
F: Documentation/blockdev/nbd.txt
F: drivers/block/nbd.c
F: include/linux/nbd.h
@@ -10690,6 +10691,7 @@ F: drivers/pci/*xen*
XEN BLOCK SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+M: Roger Pau Monné <roger.pau@citrix.com>
L: xen-devel@lists.xenproject.org (moderated for non-subscribers)
S: Supported
F: drivers/block/xen-blkback/*
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index f532c92bf99d..ee90db17b097 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -139,26 +139,17 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
* axon_ram_direct_access - direct_access() method for block device
* @device, @sector, @data: see block_device_operations method
*/
-static int
+static long
axon_ram_direct_access(struct block_device *device, sector_t sector,
- void **kaddr, unsigned long *pfn)
+ void **kaddr, unsigned long *pfn, long size)
{
struct axon_ram_bank *bank = device->bd_disk->private_data;
- loff_t offset;
-
- offset = sector;
- if (device->bd_part != NULL)
- offset += device->bd_part->start_sect;
- offset <<= AXON_RAM_SECTOR_SHIFT;
- if (offset >= bank->size) {
- dev_err(&bank->device->dev, "Access outside of address space\n");
- return -ERANGE;
- }
+ loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
*kaddr = (void *)(bank->ph_addr + offset);
- *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT;
+ *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
- return 0;
+ return bank->size - offset;
}
static const struct block_device_operations axon_ram_devops = {
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 920e6160c535..5ba2d9ce82dc 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
diff --git a/block/bio.c b/block/bio.c
index 471d7382c7d1..f66a4eae16ee 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -28,7 +28,6 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
-#include <scsi/sg.h> /* for struct sg_iovec */
#include <trace/events/block.h>
@@ -1022,21 +1021,11 @@ void bio_copy_data(struct bio *dst, struct bio *src)
EXPORT_SYMBOL(bio_copy_data);
struct bio_map_data {
- int nr_sgvecs;
int is_our_pages;
- struct sg_iovec sgvecs[];
+ struct iov_iter iter;
+ struct iovec iov[];
};
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
- const struct sg_iovec *iov, int iov_count,
- int is_our_pages)
-{
- memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
- bmd->nr_sgvecs = iov_count;
- bmd->is_our_pages = is_our_pages;
- bio->bi_private = bmd;
-}
-
static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
gfp_t gfp_mask)
{
@@ -1044,85 +1033,101 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
return NULL;
return kmalloc(sizeof(struct bio_map_data) +
- sizeof(struct sg_iovec) * iov_count, gfp_mask);
+ sizeof(struct iovec) * iov_count, gfp_mask);
}
-static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
- int to_user, int from_user, int do_free_page)
+/**
+ * bio_copy_from_iter - copy all pages from iov_iter to bio
+ * @bio: The &struct bio which describes the I/O as destination
+ * @iter: iov_iter as source
+ *
+ * Copy all pages from iov_iter to bio.
+ * Returns 0 on success, or error on failure.
+ */
+static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter)
{
- int ret = 0, i;
+ int i;
struct bio_vec *bvec;
- int iov_idx = 0;
- unsigned int iov_off = 0;
bio_for_each_segment_all(bvec, bio, i) {
- char *bv_addr = page_address(bvec->bv_page);
- unsigned int bv_len = bvec->bv_len;
+ ssize_t ret;
- while (bv_len && iov_idx < iov_count) {
- unsigned int bytes;
- char __user *iov_addr;
+ ret = copy_page_from_iter(bvec->bv_page,
+ bvec->bv_offset,
+ bvec->bv_len,
+ &iter);
- bytes = min_t(unsigned int,
- iov[iov_idx].iov_len - iov_off, bv_len);
- iov_addr = iov[iov_idx].iov_base + iov_off;
+ if (!iov_iter_count(&iter))
+ break;
- if (!ret) {
- if (to_user)
- ret = copy_to_user(iov_addr, bv_addr,
- bytes);
+ if (ret < bvec->bv_len)
+ return -EFAULT;
+ }
- if (from_user)
- ret = copy_from_user(bv_addr, iov_addr,
- bytes);
+ return 0;
+}
- if (ret)
- ret = -EFAULT;
- }
+/**
+ * bio_copy_to_iter - copy all pages from bio to iov_iter
+ * @bio: The &struct bio which describes the I/O as source
+ * @iter: iov_iter as destination
+ *
+ * Copy all pages from bio to iov_iter.
+ * Returns 0 on success, or error on failure.
+ */
+static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
+{
+ int i;
+ struct bio_vec *bvec;
- bv_len -= bytes;
- bv_addr += bytes;
- iov_addr += bytes;
- iov_off += bytes;
+ bio_for_each_segment_all(bvec, bio, i) {
+ ssize_t ret;
- if (iov[iov_idx].iov_len == iov_off) {
- iov_idx++;
- iov_off = 0;
- }
- }
+ ret = copy_page_to_iter(bvec->bv_page,
+ bvec->bv_offset,
+ bvec->bv_len,
+ &iter);
+
+ if (!iov_iter_count(&iter))
+ break;
- if (do_free_page)
- __free_page(bvec->bv_page);
+ if (ret < bvec->bv_len)
+ return -EFAULT;
}
- return ret;
+ return 0;
+}
+
+static void bio_free_pages(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ __free_page(bvec->bv_page);
}
/**
* bio_uncopy_user - finish previously mapped bio
* @bio: bio being terminated
*
- * Free pages allocated from bio_copy_user() and write back data
+ * Free pages allocated from bio_copy_user_iov() and write back data
* to user space in case of a read.
*/
int bio_uncopy_user(struct bio *bio)
{
struct bio_map_data *bmd = bio->bi_private;
- struct bio_vec *bvec;
- int ret = 0, i;
+ int ret = 0;
if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
/*
* if we're in a workqueue, the request is orphaned, so
* don't copy into a random user address space, just free.
*/
- if (current->mm)
- ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
- bio_data_dir(bio) == READ,
- 0, bmd->is_our_pages);
- else if (bmd->is_our_pages)
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
+ if (current->mm && bio_data_dir(bio) == READ)
+ ret = bio_copy_to_iter(bio, bmd->iter);
+ if (bmd->is_our_pages)
+ bio_free_pages(bio);
}
kfree(bmd);
bio_put(bio);
@@ -1132,12 +1137,10 @@ EXPORT_SYMBOL(bio_uncopy_user);
/**
* bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
+ * @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
+ * @iter: iovec iterator
+ * @gfp_mask: memory allocation flags
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
@@ -1145,25 +1148,25 @@ EXPORT_SYMBOL(bio_uncopy_user);
*/
struct bio *bio_copy_user_iov(struct request_queue *q,
struct rq_map_data *map_data,
- const struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
+ const struct iov_iter *iter,
+ gfp_t gfp_mask)
{
struct bio_map_data *bmd;
- struct bio_vec *bvec;
struct page *page;
struct bio *bio;
int i, ret;
int nr_pages = 0;
- unsigned int len = 0;
+ unsigned int len = iter->count;
unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
- for (i = 0; i < iov_count; i++) {
+ for (i = 0; i < iter->nr_segs; i++) {
unsigned long uaddr;
unsigned long end;
unsigned long start;
- uaddr = (unsigned long)iov[i].iov_base;
- end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ uaddr = (unsigned long) iter->iov[i].iov_base;
+ end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1)
+ >> PAGE_SHIFT;
start = uaddr >> PAGE_SHIFT;
/*
@@ -1173,22 +1176,31 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
return ERR_PTR(-EINVAL);
nr_pages += end - start;
- len += iov[i].iov_len;
}
if (offset)
nr_pages++;
- bmd = bio_alloc_map_data(iov_count, gfp_mask);
+ bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask);
if (!bmd)
return ERR_PTR(-ENOMEM);
+ /*
+ * We need to do a deep copy of the iov_iter including the iovecs.
+ * The caller provided iov might point to an on-stack or otherwise
+ * shortlived one.
+ */
+ bmd->is_our_pages = map_data ? 0 : 1;
+ memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs);
+ iov_iter_init(&bmd->iter, iter->type, bmd->iov,
+ iter->nr_segs, iter->count);
+
ret = -ENOMEM;
bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio)
goto out_bmd;
- if (!write_to_vm)
+ if (iter->type & WRITE)
bio->bi_rw |= REQ_WRITE;
ret = 0;
@@ -1236,20 +1248,18 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
/*
* success
*/
- if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
+ if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) ||
(map_data && map_data->from_user)) {
- ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
+ ret = bio_copy_from_iter(bio, *iter);
if (ret)
goto cleanup;
}
- bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
+ bio->bi_private = bmd;
return bio;
cleanup:
if (!map_data)
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
-
+ bio_free_pages(bio);
bio_put(bio);
out_bmd:
kfree(bmd);
@@ -1257,46 +1267,30 @@ out_bmd:
}
/**
- * bio_copy_user - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
+ * bio_map_user_iov - map user iovec into bio
+ * @q: the struct request_queue for the bio
+ * @iter: iovec iterator
+ * @gfp_mask: memory allocation flags
*
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
+ * Map the user space address into a bio suitable for io to a block
+ * device. Returns an error pointer in case of error.
*/
-struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
- unsigned long uaddr, unsigned int len,
- int write_to_vm, gfp_t gfp_mask)
+struct bio *bio_map_user_iov(struct request_queue *q,
+ const struct iov_iter *iter,
+ gfp_t gfp_mask)
{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_copy_user);
-
-static struct bio *__bio_map_user_iov(struct request_queue *q,
- struct block_device *bdev,
- const struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- int i, j;
+ int j;
int nr_pages = 0;
struct page **pages;
struct bio *bio;
int cur_page = 0;
int ret, offset;
+ struct iov_iter i;
+ struct iovec iov;
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
+ iov_for_each(iov, i, *iter) {
+ unsigned long uaddr = (unsigned long) iov.iov_base;
+ unsigned long len = iov.iov_len;
unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = uaddr >> PAGE_SHIFT;
@@ -1326,16 +1320,17 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
if (!pages)
goto out;
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
+ iov_for_each(iov, i, *iter) {
+ unsigned long uaddr = (unsigned long) iov.iov_base;
+ unsigned long len = iov.iov_len;
unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = uaddr >> PAGE_SHIFT;
const int local_nr_pages = end - start;
const int page_limit = cur_page + local_nr_pages;
ret = get_user_pages_fast(uaddr, local_nr_pages,
- write_to_vm, &pages[cur_page]);
+ (iter->type & WRITE) != WRITE,
+ &pages[cur_page]);
if (ret < local_nr_pages) {
ret = -EFAULT;
goto out_unmap;
@@ -1375,72 +1370,10 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
/*
* set data direction, and check if mapped pages need bouncing
*/
- if (!write_to_vm)
+ if (iter->type & WRITE)
bio->bi_rw |= REQ_WRITE;
- bio->bi_bdev = bdev;
bio->bi_flags |= (1 << BIO_USER_MAPPED);
- return bio;
-
- out_unmap:
- for (i = 0; i < nr_pages; i++) {
- if(!pages[i])
- break;
- page_cache_release(pages[i]);
- }
- out:
- kfree(pages);
- bio_put(bio);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_map_user - map user address into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
- unsigned long uaddr, unsigned int len, int write_to_vm,
- gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_map_user);
-
-/**
- * bio_map_user_iov - map user sg_iovec table into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
- const struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
- gfp_mask);
- if (IS_ERR(bio))
- return bio;
/*
* subtle -- if __bio_map_user() ended up bouncing a bio,
@@ -1449,8 +1382,18 @@ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
* reference to it
*/
bio_get(bio);
-
return bio;
+
+ out_unmap:
+ for (j = 0; j < nr_pages; j++) {
+ if (!pages[j])
+ break;
+ page_cache_release(pages[j]);
+ }
+ out:
+ kfree(pages);
+ bio_put(bio);
+ return ERR_PTR(ret);
}
static void __bio_unmap_user(struct bio *bio)
@@ -1492,8 +1435,18 @@ static void bio_map_kern_endio(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *__bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
+/**
+ * bio_map_kern - map kernel address into bio
+ * @q: the struct request_queue for the bio
+ * @data: pointer to buffer to map
+ * @len: length in bytes
+ * @gfp_mask: allocation flags for bio allocation
+ *
+ * Map the kernel address into a bio suitable for io to a block
+ * device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
+ gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -1517,8 +1470,11 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
bytes = len;
if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
- offset) < bytes)
- break;
+ offset) < bytes) {
+ /* we don't support partial mappings */
+ bio_put(bio);
+ return ERR_PTR(-EINVAL);
+ }
data += bytes;
len -= bytes;
@@ -1528,57 +1484,26 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
+EXPORT_SYMBOL(bio_map_kern);
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask)
+static void bio_copy_kern_endio(struct bio *bio, int err)
{
- struct bio *bio;
-
- bio = __bio_map_kern(q, data, len, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (bio->bi_iter.bi_size == len)
- return bio;
-
- /*
- * Don't support partial mappings.
- */
+ bio_free_pages(bio);
bio_put(bio);
- return ERR_PTR(-EINVAL);
}
-EXPORT_SYMBOL(bio_map_kern);
-static void bio_copy_kern_endio(struct bio *bio, int err)
+static void bio_copy_kern_endio_read(struct bio *bio, int err)
{
+ char *p = bio->bi_private;
struct bio_vec *bvec;
- const int read = bio_data_dir(bio) == READ;
- struct bio_map_data *bmd = bio->bi_private;
int i;
- char *p = bmd->sgvecs[0].iov_base;
bio_for_each_segment_all(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- if (read)
- memcpy(p, addr, bvec->bv_len);
-
- __free_page(bvec->bv_page);
+ memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
- kfree(bmd);
- bio_put(bio);
+ bio_copy_kern_endio(bio, err);
}
/**
@@ -1595,28 +1520,59 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
gfp_t gfp_mask, int reading)
{
+ unsigned long kaddr = (unsigned long)data;
+ unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned long start = kaddr >> PAGE_SHIFT;
struct bio *bio;
- struct bio_vec *bvec;
- int i;
+ void *p = data;
+ int nr_pages = 0;
+
+ /*
+ * Overflow, abort
+ */
+ if (end < start)
+ return ERR_PTR(-EINVAL);
- bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
- if (IS_ERR(bio))
- return bio;
+ nr_pages = end - start;
+ bio = bio_kmalloc(gfp_mask, nr_pages);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
- if (!reading) {
- void *p = data;
+ while (len) {
+ struct page *page;
+ unsigned int bytes = PAGE_SIZE;
- bio_for_each_segment_all(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
+ if (bytes > len)
+ bytes = len;
- memcpy(addr, p, bvec->bv_len);
- p += bvec->bv_len;
- }
+ page = alloc_page(q->bounce_gfp | gfp_mask);
+ if (!page)
+ goto cleanup;
+
+ if (!reading)
+ memcpy(page_address(page), p, bytes);
+
+ if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+ break;
+
+ len -= bytes;
+ p += bytes;
}
- bio->bi_end_io = bio_copy_kern_endio;
+ if (reading) {
+ bio->bi_end_io = bio_copy_kern_endio_read;
+ bio->bi_private = data;
+ } else {
+ bio->bi_end_io = bio_copy_kern_endio;
+ bio->bi_rw |= REQ_WRITE;
+ }
return bio;
+
+cleanup:
+ bio_free_pages(bio);
+ bio_put(bio);
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_copy_kern);
diff --git a/block/blk-core.c b/block/blk-core.c
index 3ad405571dcc..794c3e7f01cf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -607,7 +607,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
- q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+ q->backing_dev_info.capabilities = 0;
q->backing_dev_info.name = "block";
q->node = node_id;
@@ -2048,6 +2048,13 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
return -EIO;
+ if (q->mq_ops) {
+ if (blk_queue_io_stat(q))
+ blk_account_io_start(rq, true);
+ blk_mq_insert_request(rq, false, true, true);
+ return 0;
+ }
+
spin_lock_irqsave(q->queue_lock, flags);
if (unlikely(blk_queue_dying(q))) {
spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2907,7 +2914,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
static void __blk_rq_prep_clone(struct request *dst, struct request *src)
{
dst->cpu = src->cpu;
- dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+ dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
dst->cmd_type = src->cmd_type;
dst->__sector = blk_rq_pos(src);
dst->__data_len = blk_rq_bytes(src);
@@ -2945,8 +2952,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
if (!bs)
bs = fs_bio_set;
- blk_rq_init(NULL, rq);
-
__rq_for_each_bio(bio_src, rq_src) {
bio = bio_clone_fast(bio_src, gfp_mask, bs);
if (!bio)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 8411be3c19d3..7688ee3f5d72 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -283,24 +283,34 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
+ * @discard: whether to discard the block range
*
* Description:
- * Generate and issue number of bios with zerofiled pages.
+ * Zero-fill a block range. If the discard flag is set and the block
+ * device guarantees that subsequent READ operations to the block range
+ * in question will return zeroes, the blocks will be discarded. Should
+ * the discard request fail, if the discard flag is not set, or if
+ * discard_zeroes_data is not supported, this function will resort to
+ * zeroing the blocks manually, thus provisioning (allocating,
+ * anchoring) them. If the block device supports the WRITE SAME command
+ * blkdev_issue_zeroout() will use it to optimize the process of
+ * clearing the block range. Otherwise the zeroing will be performed
+ * using regular WRITE calls.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask)
+ sector_t nr_sects, gfp_t gfp_mask, bool discard)
{
- if (bdev_write_same(bdev)) {
- unsigned char bdn[BDEVNAME_SIZE];
+ struct request_queue *q = bdev_get_queue(bdev);
- if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
- ZERO_PAGE(0)))
- return 0;
+ if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data &&
+ blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0)
+ return 0;
- bdevname(bdev, bdn);
- pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
- }
+ if (bdev_write_same(bdev) &&
+ blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
+ ZERO_PAGE(0)) == 0)
+ return 0;
return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
}
diff --git a/block/blk-map.c b/block/blk-map.c
index f890d4345b0c..b8d2725324a6 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -5,7 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <scsi/sg.h> /* for struct sg_iovec */
+#include <linux/uio.h>
#include "blk.h"
@@ -39,138 +39,12 @@ static int __blk_rq_unmap_user(struct bio *bio)
return ret;
}
-static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
- struct rq_map_data *map_data, void __user *ubuf,
- unsigned int len, gfp_t gfp_mask)
-{
- unsigned long uaddr;
- struct bio *bio, *orig_bio;
- int reading, ret;
-
- reading = rq_data_dir(rq) == READ;
-
- /*
- * if alignment requirement is satisfied, map in user pages for
- * direct dma. else, set up kernel bounce buffers
- */
- uaddr = (unsigned long) ubuf;
- if (blk_rq_aligned(q, uaddr, len) && !map_data)
- bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
- else
- bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
-
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- if (map_data && map_data->null_mapped)
- bio->bi_flags |= (1 << BIO_NULL_MAPPED);
-
- orig_bio = bio;
- blk_queue_bounce(q, &bio);
-
- /*
- * We link the bounce buffer in and could have to traverse it
- * later so we have to get a ref to prevent it from being freed
- */
- bio_get(bio);
-
- ret = blk_rq_append_bio(q, rq, bio);
- if (!ret)
- return bio->bi_iter.bi_size;
-
- /* if it was boucned we must call the end io function */
- bio_endio(bio, 0);
- __blk_rq_unmap_user(orig_bio);
- bio_put(bio);
- return ret;
-}
-
-/**
- * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
- * @q: request queue where request should be inserted
- * @rq: request structure to fill
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @ubuf: the user buffer
- * @len: length of user data
- * @gfp_mask: memory allocation flags
- *
- * Description:
- * Data will be mapped directly for zero copy I/O, if possible. Otherwise
- * a kernel bounce buffer is used.
- *
- * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
- * still in process context.
- *
- * Note: The mapped bio may need to be bounced through blk_queue_bounce()
- * before being submitted to the device, as pages mapped may be out of
- * reach. It's the callers responsibility to make sure this happens. The
- * original bio must be passed back in to blk_rq_unmap_user() for proper
- * unmapping.
- */
-int blk_rq_map_user(struct request_queue *q, struct request *rq,
- struct rq_map_data *map_data, void __user *ubuf,
- unsigned long len, gfp_t gfp_mask)
-{
- unsigned long bytes_read = 0;
- struct bio *bio = NULL;
- int ret;
-
- if (len > (queue_max_hw_sectors(q) << 9))
- return -EINVAL;
- if (!len)
- return -EINVAL;
-
- if (!ubuf && (!map_data || !map_data->null_mapped))
- return -EINVAL;
-
- while (bytes_read != len) {
- unsigned long map_len, end, start;
-
- map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
- end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
- start = (unsigned long)ubuf >> PAGE_SHIFT;
-
- /*
- * A bad offset could cause us to require BIO_MAX_PAGES + 1
- * pages. If this happens we just lower the requested
- * mapping len by a page so that we can fit
- */
- if (end - start > BIO_MAX_PAGES)
- map_len -= PAGE_SIZE;
-
- ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
- gfp_mask);
- if (ret < 0)
- goto unmap_rq;
- if (!bio)
- bio = rq->bio;
- bytes_read += ret;
- ubuf += ret;
-
- if (map_data)
- map_data->offset += ret;
- }
-
- if (!bio_flagged(bio, BIO_USER_MAPPED))
- rq->cmd_flags |= REQ_COPY_USER;
-
- return 0;
-unmap_rq:
- blk_rq_unmap_user(bio);
- rq->bio = NULL;
- return ret;
-}
-EXPORT_SYMBOL(blk_rq_map_user);
-
/**
* blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to map data to
* @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iov: pointer to the iovec
- * @iov_count: number of elements in the iovec
- * @len: I/O byte count
+ * @iter: iovec iterator
* @gfp_mask: memory allocation flags
*
* Description:
@@ -187,20 +61,21 @@ EXPORT_SYMBOL(blk_rq_map_user);
* unmapping.
*/
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
- struct rq_map_data *map_data, const struct sg_iovec *iov,
- int iov_count, unsigned int len, gfp_t gfp_mask)
+ struct rq_map_data *map_data,
+ const struct iov_iter *iter, gfp_t gfp_mask)
{
struct bio *bio;
- int i, read = rq_data_dir(rq) == READ;
int unaligned = 0;
+ struct iov_iter i;
+ struct iovec iov;
- if (!iov || iov_count <= 0)
+ if (!iter || !iter->count)
return -EINVAL;
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
+ iov_for_each(iov, i, *iter) {
+ unsigned long uaddr = (unsigned long) iov.iov_base;
- if (!iov[i].iov_len)
+ if (!iov.iov_len)
return -EINVAL;
/*
@@ -210,16 +85,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
unaligned = 1;
}
- if (unaligned || (q->dma_pad_mask & len) || map_data)
- bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
- gfp_mask);
+ if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
+ bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
else
- bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
+ bio = bio_map_user_iov(q, iter, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
- if (bio->bi_iter.bi_size != len) {
+ if (map_data && map_data->null_mapped)
+ bio->bi_flags |= (1 << BIO_NULL_MAPPED);
+
+ if (bio->bi_iter.bi_size != iter->count) {
/*
* Grab an extra reference to this bio, as bio_unmap_user()
* expects to be able to drop it twice as it happens on the
@@ -241,6 +118,21 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
+int blk_rq_map_user(struct request_queue *q, struct request *rq,
+ struct rq_map_data *map_data, void __user *ubuf,
+ unsigned long len, gfp_t gfp_mask)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ iov.iov_base = ubuf;
+ iov.iov_len = len;
+ iov_iter_init(&i, rq_data_dir(rq), &iov, 1, len);
+
+ return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask);
+}
+EXPORT_SYMBOL(blk_rq_map_user);
+
/**
* blk_rq_unmap_user - unmap a request with user data
* @bio: start of bio list
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 89b97b5e0881..fc1ff3b1ea1f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -283,35 +283,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(blk_rq_map_sg);
-/**
- * blk_bio_map_sg - map a bio to a scatterlist
- * @q: request_queue in question
- * @bio: bio being mapped
- * @sglist: scatterlist being mapped
- *
- * Note:
- * Caller must make sure sg can hold bio->bi_phys_segments entries
- *
- * Will return the number of sg entries setup
- */
-int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
- struct scatterlist *sglist)
-{
- struct scatterlist *sg = NULL;
- int nsegs;
- struct bio *next = bio->bi_next;
- bio->bi_next = NULL;
-
- nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
- bio->bi_next = next;
- if (sg)
- sg_mark_end(sg);
-
- BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
- return nsegs;
-}
-EXPORT_SYMBOL(blk_bio_map_sg);
-
static inline int ll_new_hw_segment(struct request_queue *q,
struct request *req,
struct bio *bio)
@@ -385,6 +356,14 @@ static bool req_no_special_merge(struct request *req)
return !q->mq_ops && req->special;
}
+static int req_gap_to_prev(struct request *req, struct request *next)
+{
+ struct bio *prev = req->biotail;
+
+ return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1],
+ next->bio->bi_io_vec[0].bv_offset);
+}
+
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
@@ -399,6 +378,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
+ if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) &&
+ req_gap_to_prev(req, next))
+ return 0;
+
/*
* Will it become too large?
*/
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 60c9d4a93fe4..d53a764b05ea 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -140,35 +140,39 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return atomic_read(&hctx->nr_active) < depth;
}
-static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
+ bool nowrap)
{
- int tag, org_last_tag, end;
- bool wrap = last_tag != 0;
+ int tag, org_last_tag = last_tag;
- org_last_tag = last_tag;
- end = bm->depth;
- do {
-restart:
- tag = find_next_zero_bit(&bm->word, end, last_tag);
- if (unlikely(tag >= end)) {
+ while (1) {
+ tag = find_next_zero_bit(&bm->word, bm->depth, last_tag);
+ if (unlikely(tag >= bm->depth)) {
/*
- * We started with an offset, start from 0 to
+ * We started with an offset, and we didn't reset the
+ * offset to 0 in a failure case, so start from 0 to
* exhaust the map.
*/
- if (wrap) {
- wrap = false;
- end = org_last_tag;
- last_tag = 0;
- goto restart;
+ if (org_last_tag && last_tag && !nowrap) {
+ last_tag = org_last_tag = 0;
+ continue;
}
return -1;
}
+
+ if (!test_and_set_bit(tag, &bm->word))
+ break;
+
last_tag = tag + 1;
- } while (test_and_set_bit(tag, &bm->word));
+ if (last_tag >= bm->depth - 1)
+ last_tag = 0;
+ }
return tag;
}
+#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
+
/*
* Straight forward bitmap tag implementation, where each bit is a tag
* (cleared == free, and set == busy). The small twist is using per-cpu
@@ -181,7 +185,7 @@ restart:
* until the map is exhausted.
*/
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
- unsigned int *tag_cache)
+ unsigned int *tag_cache, struct blk_mq_tags *tags)
{
unsigned int last_tag, org_last_tag;
int index, i, tag;
@@ -193,15 +197,24 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
index = TAG_TO_INDEX(bt, last_tag);
for (i = 0; i < bt->map_nr; i++) {
- tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
+ tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
+ BT_ALLOC_RR(tags));
if (tag != -1) {
tag += (index << bt->bits_per_word);
goto done;
}
- last_tag = 0;
- if (++index >= bt->map_nr)
+ /*
+ * Jump to next index, and reset the last tag to be the
+ * first tag of that index
+ */
+ index++;
+ last_tag = (index << bt->bits_per_word);
+
+ if (index >= bt->map_nr) {
index = 0;
+ last_tag = 0;
+ }
}
*tag_cache = 0;
@@ -212,7 +225,7 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
* up using the specific cached tag.
*/
done:
- if (tag == org_last_tag) {
+ if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
last_tag = tag + 1;
if (last_tag >= bt->depth - 1)
last_tag = 0;
@@ -241,13 +254,13 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
static int bt_get(struct blk_mq_alloc_data *data,
struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx,
- unsigned int *last_tag)
+ unsigned int *last_tag, struct blk_mq_tags *tags)
{
struct bt_wait_state *bs;
DEFINE_WAIT(wait);
int tag;
- tag = __bt_get(hctx, bt, last_tag);
+ tag = __bt_get(hctx, bt, last_tag, tags);
if (tag != -1)
return tag;
@@ -258,7 +271,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
do {
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
- tag = __bt_get(hctx, bt, last_tag);
+ tag = __bt_get(hctx, bt, last_tag, tags);
if (tag != -1)
break;
@@ -273,7 +286,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
- tag = __bt_get(hctx, bt, last_tag);
+ tag = __bt_get(hctx, bt, last_tag, tags);
if (tag != -1)
break;
@@ -304,7 +317,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
int tag;
tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
- &data->ctx->last_tag);
+ &data->ctx->last_tag, data->hctx->tags);
if (tag >= 0)
return tag + data->hctx->tags->nr_reserved_tags;
@@ -320,7 +333,8 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
return BLK_MQ_TAG_FAIL;
}
- tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero);
+ tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
+ data->hctx->tags);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
@@ -392,7 +406,8 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
BUG_ON(real_tag >= tags->nr_tags);
bt_clear_tag(&tags->bitmap_tags, real_tag);
- *last_tag = real_tag;
+ if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
+ *last_tag = real_tag;
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
bt_clear_tag(&tags->breserved_tags, tag);
@@ -509,6 +524,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
if (!bt->bs) {
kfree(bt->map);
+ bt->map = NULL;
return -ENOMEM;
}
@@ -529,10 +545,12 @@ static void bt_free(struct blk_mq_bitmap_tags *bt)
}
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
- int node)
+ int node, int alloc_policy)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+ tags->alloc_policy = alloc_policy;
+
if (bt_alloc(&tags->bitmap_tags, depth, node, false))
goto enomem;
if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
@@ -546,7 +564,8 @@ enomem:
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
- unsigned int reserved_tags, int node)
+ unsigned int reserved_tags,
+ int node, int alloc_policy)
{
struct blk_mq_tags *tags;
@@ -562,7 +581,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
- return blk_mq_init_bitmap_tags(tags, node);
+ return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index a6fa0fc9d41a..90767b370308 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -42,10 +42,12 @@ struct blk_mq_tags {
struct request **rqs;
struct list_head page_list;
+
+ int alloc_policy;
};
-extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
+extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2390c5541e71..4f4bea21052e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,6 +33,7 @@ static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
+static void blk_mq_run_queues(struct request_queue *q);
/*
* Check if any of the ctx's have pending work in this hardware queue
@@ -117,7 +118,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
if (freeze) {
percpu_ref_kill(&q->mq_usage_counter);
- blk_mq_run_queues(q, false);
+ blk_mq_run_queues(q);
}
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@ -136,6 +137,7 @@ void blk_mq_freeze_queue(struct request_queue *q)
blk_mq_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void blk_mq_unfreeze_queue(struct request_queue *q)
{
@@ -902,7 +904,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
&hctx->run_work, 0);
}
-void blk_mq_run_queues(struct request_queue *q, bool async)
+static void blk_mq_run_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -913,10 +915,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
test_bit(BLK_MQ_S_STOPPED, &hctx->state))
continue;
- blk_mq_run_hw_queue(hctx, async);
+ blk_mq_run_hw_queue(hctx, false);
}
}
-EXPORT_SYMBOL(blk_mq_run_queues);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
@@ -954,7 +955,6 @@ void blk_mq_start_hw_queues(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
-
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
@@ -1423,7 +1423,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
size_t rq_size, left;
tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
- set->numa_node);
+ set->numa_node,
+ BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
diff --git a/block/blk-tag.c b/block/blk-tag.c
index a185b86741e5..f0344e6939d5 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -119,7 +119,7 @@ fail:
}
static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
- int depth)
+ int depth, int alloc_policy)
{
struct blk_queue_tag *tags;
@@ -131,6 +131,8 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
goto fail;
atomic_set(&tags->refcnt, 1);
+ tags->alloc_policy = alloc_policy;
+ tags->next_tag = 0;
return tags;
fail:
kfree(tags);
@@ -140,10 +142,11 @@ fail:
/**
* blk_init_tags - initialize the tag info for an external tag map
* @depth: the maximum queue depth supported
+ * @alloc_policy: tag allocation policy
**/
-struct blk_queue_tag *blk_init_tags(int depth)
+struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy)
{
- return __blk_queue_init_tags(NULL, depth);
+ return __blk_queue_init_tags(NULL, depth, alloc_policy);
}
EXPORT_SYMBOL(blk_init_tags);
@@ -152,19 +155,20 @@ EXPORT_SYMBOL(blk_init_tags);
* @q: the request queue for the device
* @depth: the maximum queue depth supported
* @tags: the tag to use
+ * @alloc_policy: tag allocation policy
*
* Queue lock must be held here if the function is called to resize an
* existing map.
**/
int blk_queue_init_tags(struct request_queue *q, int depth,
- struct blk_queue_tag *tags)
+ struct blk_queue_tag *tags, int alloc_policy)
{
int rc;
BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
if (!tags && !q->queue_tags) {
- tags = __blk_queue_init_tags(q, depth);
+ tags = __blk_queue_init_tags(q, depth, alloc_policy);
if (!tags)
return -ENOMEM;
@@ -344,9 +348,21 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
}
do {
- tag = find_first_zero_bit(bqt->tag_map, max_depth);
- if (tag >= max_depth)
- return 1;
+ if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) {
+ tag = find_first_zero_bit(bqt->tag_map, max_depth);
+ if (tag >= max_depth)
+ return 1;
+ } else {
+ int start = bqt->next_tag;
+ int size = min_t(int, bqt->max_depth, max_depth + start);
+ tag = find_next_zero_bit(bqt->tag_map, size, start);
+ if (tag >= size && start + size > bqt->max_depth) {
+ size = start + size - bqt->max_depth;
+ tag = find_first_zero_bit(bqt->tag_map, size);
+ }
+ if (tag >= size)
+ return 1;
+ }
} while (test_and_set_bit_lock(tag, bqt->tag_map));
/*
@@ -354,6 +370,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
* See blk_queue_end_tag for details.
*/
+ bqt->next_tag = (tag + 1) % bqt->max_depth;
rq->cmd_flags |= REQ_QUEUED;
rq->tag = tag;
bqt->tag_index[tag] = rq;
diff --git a/block/bsg.c b/block/bsg.c
index 276e869e686c..d214e929ce18 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
}
-static int bsg_io_schedule(struct bsg_device *bd)
-{
- DEFINE_WAIT(wait);
- int ret = 0;
-
- spin_lock_irq(&bd->lock);
-
- BUG_ON(bd->done_cmds > bd->queued_cmds);
-
- /*
- * -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no
- * work to do", even though we return -ENOSPC after this same test
- * during bsg_write() -- there, it means our buffer can't have more
- * bsg_commands added to it, thus has no space left.
- */
- if (bd->done_cmds == bd->queued_cmds) {
- ret = -ENODATA;
- goto unlock;
- }
-
- if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
- ret = -EAGAIN;
- goto unlock;
- }
-
- prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&bd->lock);
- io_schedule();
- finish_wait(&bd->wq_done, &wait);
-
- return ret;
-unlock:
- spin_unlock_irq(&bd->lock);
- return ret;
-}
-
static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_v4 *hdr, struct bsg_device *bd,
fmode_t has_write_perm)
@@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
return ret;
}
+static bool bsg_complete(struct bsg_device *bd)
+{
+ bool ret = false;
+ bool spin;
+
+ do {
+ spin_lock_irq(&bd->lock);
+
+ BUG_ON(bd->done_cmds > bd->queued_cmds);
+
+ /*
+ * All commands consumed.
+ */
+ if (bd->done_cmds == bd->queued_cmds)
+ ret = true;
+
+ spin = !test_bit(BSG_F_BLOCK, &bd->flags);
+
+ spin_unlock_irq(&bd->lock);
+ } while (!ret && spin);
+
+ return ret;
+}
+
static int bsg_complete_all_commands(struct bsg_device *bd)
{
struct bsg_command *bc;
@@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
/*
* wait for all commands to complete
*/
- ret = 0;
- do {
- ret = bsg_io_schedule(bd);
- /*
- * look for -ENODATA specifically -- we'll sometimes get
- * -ERESTARTSYS when we've taken a signal, but we can't
- * return until we're done freeing the queue, so ignore
- * it. The signal will get handled when we're done freeing
- * the bsg_device.
- */
- } while (ret != -ENODATA);
+ io_wait_event(bd->wq_done, bsg_complete(bd));
/*
* discard done commands
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6f2751d305de..5da8e6e9ab4b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3590,6 +3590,11 @@ retry:
blkcg = bio_blkcg(bio);
cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
+ if (!cfqg) {
+ cfqq = &cfqd->oom_cfqq;
+ goto out;
+ }
+
cfqq = cic_to_cfqq(cic, is_sync);
/*
@@ -3626,7 +3631,7 @@ retry:
} else
cfqq = &cfqd->oom_cfqq;
}
-
+out:
if (new_cfqq)
kmem_cache_free(cfq_pool, new_cfqq);
@@ -3656,12 +3661,17 @@ static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
struct bio *bio, gfp_t gfp_mask)
{
- const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
- const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
+ int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
+ int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
struct cfq_queue **async_cfqq = NULL;
struct cfq_queue *cfqq = NULL;
if (!is_sync) {
+ if (!ioprio_valid(cic->ioprio)) {
+ struct task_struct *tsk = current;
+ ioprio = task_nice_ioprio(tsk);
+ ioprio_class = task_nice_ioclass(tsk);
+ }
async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
cfqq = *async_cfqq;
}
diff --git a/block/ioctl.c b/block/ioctl.c
index 6c7bf903742f..7d8befde2aca 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -198,7 +198,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
if (start + len > (i_size_read(bdev->bd_inode) >> 9))
return -EINVAL;
- return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
+ return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
}
static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/block/partitions/check.c b/block/partitions/check.c
index 9ac1df74f699..16118d11dbfc 100644
--- a/block/partitions/check.c
+++ b/block/partitions/check.c
@@ -184,12 +184,12 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
if (err)
/* The partition is unrecognized. So report I/O errors if there were any */
res = err;
- if (!res)
- strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
- else if (warn_no_part)
- strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
-
- printk(KERN_INFO "%s", state->pp_buf);
+ if (res) {
+ if (warn_no_part)
+ strlcat(state->pp_buf,
+ " unable to read partition table\n", PAGE_SIZE);
+ printk(KERN_INFO "%s", state->pp_buf);
+ }
free_page((unsigned long)state->pp_buf);
free_partitions(state);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 28163fad3c5d..e1f71c396193 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -332,7 +332,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
ret = 0;
if (hdr->iovec_count) {
- size_t iov_data_len;
+ struct iov_iter i;
struct iovec *iov = NULL;
ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
@@ -342,20 +342,11 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out_free_cdb;
}
- iov_data_len = ret;
- ret = 0;
-
/* SG_IO howto says that the shorter of the two wins */
- if (hdr->dxfer_len < iov_data_len) {
- hdr->iovec_count = iov_shorten(iov,
- hdr->iovec_count,
- hdr->dxfer_len);
- iov_data_len = hdr->dxfer_len;
- }
+ iov_iter_init(&i, rq_data_dir(rq), iov, hdr->iovec_count,
+ min_t(unsigned, ret, hdr->dxfer_len));
- ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
- hdr->iovec_count,
- iov_data_len, GFP_KERNEL);
+ ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL);
kfree(iov);
} else if (hdr->dxfer_len)
ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 4b0d5e71858e..4c35f0822d06 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
else
tag = 0;
- if (test_and_set_bit(tag, &ap->qc_allocated))
- BUG();
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
@@ -4722,69 +4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
}
/**
- * ata_qc_new - Request an available ATA command, for queueing
- * @ap: target port
- *
- * Some ATA host controllers may implement a queue depth which is less
- * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond
- * the hardware limitation.
+ * ata_qc_new_init - Request an available ATA command, and initialize it
+ * @dev: Device from whom we request an available command structure
*
* LOCKING:
* None.
*/
-static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
+struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
{
- struct ata_queued_cmd *qc = NULL;
- unsigned int max_queue = ap->host->n_tags;
- unsigned int i, tag;
+ struct ata_port *ap = dev->link->ap;
+ struct ata_queued_cmd *qc;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
- for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) {
- if (ap->flags & ATA_FLAG_LOWTAG)
- tag = i;
- else
- tag = tag < max_queue ? tag : 0;
-
- /* the last tag is reserved for internal command. */
- if (tag == ATA_TAG_INTERNAL)
- continue;
-
- if (!test_and_set_bit(tag, &ap->qc_allocated)) {
- qc = __ata_qc_from_tag(ap, tag);
- qc->tag = tag;
- ap->last_tag = tag;
- break;
- }
+ /* libsas case */
+ if (!ap->scsi_host) {
+ tag = ata_sas_allocate_tag(ap);
+ if (tag < 0)
+ return NULL;
}
- return qc;
-}
-
-/**
- * ata_qc_new_init - Request an available ATA command, and initialize it
- * @dev: Device from whom we request an available command structure
- *
- * LOCKING:
- * None.
- */
-
-struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
-{
- struct ata_port *ap = dev->link->ap;
- struct ata_queued_cmd *qc;
-
- qc = ata_qc_new(ap);
- if (qc) {
- qc->scsicmd = NULL;
- qc->ap = ap;
- qc->dev = dev;
+ qc = __ata_qc_from_tag(ap, tag);
+ qc->tag = tag;
+ qc->scsicmd = NULL;
+ qc->ap = ap;
+ qc->dev = dev;
- ata_qc_reinit(qc);
- }
+ ata_qc_reinit(qc);
return qc;
}
@@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
- clear_bit(tag, &ap->qc_allocated);
+ if (!ap->scsi_host)
+ ata_sas_free_tag(tag, ap);
}
}
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 280729325ebd..b061ba2c31d8 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
{
struct ata_queued_cmd *qc;
- qc = ata_qc_new_init(dev);
+ qc = ata_qc_new_init(dev, cmd->request->tag);
if (qc) {
qc->scsicmd = cmd;
qc->scsidone = cmd->scsi_done;
@@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
*/
shost->max_host_blocked = 1;
+ if (scsi_init_shared_tag_map(shost, host->n_tags))
+ goto err_add;
+
rc = scsi_add_host_with_dma(ap->scsi_host,
&ap->tdev, ap->host->dev);
if (rc)
@@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap)
return rc;
}
EXPORT_SYMBOL_GPL(ata_sas_queuecmd);
+
+int ata_sas_allocate_tag(struct ata_port *ap)
+{
+ unsigned int max_queue = ap->host->n_tags;
+ unsigned int i, tag;
+
+ for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) {
+ if (ap->flags & ATA_FLAG_LOWTAG)
+ tag = 1;
+ else
+ tag = tag < max_queue ? tag : 0;
+
+ /* the last tag is reserved for internal command. */
+ if (tag == ATA_TAG_INTERNAL)
+ continue;
+
+ if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) {
+ ap->sas_last_tag = tag;
+ return tag;
+ }
+ }
+ return -1;
+}
+
+void ata_sas_free_tag(unsigned int tag, struct ata_port *ap)
+{
+ clear_bit(tag, &ap->sas_tag_allocated);
+}
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 82ebe263d2f1..f840ca18a7c0 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
extern void ata_force_cbl(struct ata_port *ap);
extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
-extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
+extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag);
extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
u64 block, u32 n_block, unsigned int tf_flags,
unsigned int tag);
@@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work);
extern int ata_bus_probe(struct ata_port *ap);
extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
unsigned int id, u64 lun);
+int ata_sas_allocate_tag(struct ata_port *ap);
+void ata_sas_free_tag(unsigned int tag, struct ata_port *ap);
/* libata-eh.c */
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index ea655949023f..ba2667fa0528 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = {
.can_queue = SIL24_MAX_CMDS,
.sg_tablesize = SIL24_MAX_SGE,
.dma_boundary = ATA_DMA_BOUNDARY,
+ .tag_alloc_policy = BLK_TAG_ALLOC_FIFO,
};
static struct ata_port_operations sil24_ops = {
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3598110d2cef..c01b921b1b4a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
}
#ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access(struct block_device *bdev, sector_t sector,
- void **kaddr, unsigned long *pfn)
+static long brd_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
if (!brd)
return -ENODEV;
- if (sector & (PAGE_SECTORS-1))
- return -EINVAL;
- if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
- return -ERANGE;
page = brd_insert_page(brd, sector);
if (!page)
return -ENOSPC;
*kaddr = page_address(page);
*pfn = page_to_pfn(page);
- return 0;
+ /*
+ * TODO: If size > PAGE_SIZE, we could look to see if the next page in
+ * the file happens to be mapped to the next page of physical RAM.
+ */
+ return PAGE_SIZE;
}
#endif
@@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int rd_nr;
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-static int max_part;
-static int part_shift;
-static int part_show = 0;
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
module_param(max_part, int, S_IRUGO);
-MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
-module_param(part_show, int, S_IRUGO);
-MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
@@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i)
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
if (!brd->brd_queue)
goto out_free_dev;
+
blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+ /* This is so fdisk will align partitions on 4k, because of
+ * direct_access API needing 4k alignment, returning a PFN
+ * (This is only a problem on very small devices <= 4M,
+ * otherwise fdisk will align on 1M. Regardless this call
+ * is harmless)
+ */
+ blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
+
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
- disk = brd->brd_disk = alloc_disk(1 << part_shift);
+ disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
- disk->first_minor = i << part_shift;
+ disk->first_minor = i * max_part;
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
- if (!part_show)
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+ disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
@@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd)
kfree(brd);
}
-static struct brd_device *brd_init_one(int i)
+static struct brd_device *brd_init_one(int i, bool *new)
{
struct brd_device *brd;
+ *new = false;
list_for_each_entry(brd, &brd_devices, brd_list) {
if (brd->brd_number == i)
goto out;
@@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i)
add_disk(brd->brd_disk);
list_add_tail(&brd->brd_list, &brd_devices);
}
+ *new = true;
out:
return brd;
}
@@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
struct brd_device *brd;
struct kobject *kobj;
+ bool new;
mutex_lock(&brd_devices_mutex);
- brd = brd_init_one(MINOR(dev) >> part_shift);
+ brd = brd_init_one(MINOR(dev) / max_part, &new);
kobj = brd ? get_disk(brd->brd_disk) : NULL;
mutex_unlock(&brd_devices_mutex);
- *part = 0;
+ if (new)
+ *part = 0;
+
return kobj;
}
static int __init brd_init(void)
{
- int i, nr;
- unsigned long range;
struct brd_device *brd, *next;
+ int i;
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
- * However, this will not work well with user space tool that doesn't
- * know about such "feature". In order to not break any existing
- * tool, we do the following:
*
- * (1) if rd_nr is specified, create that many upfront, and this
- * also becomes a hard limit.
- * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
- * (default 16) rd device on module load, user can further
- * extend brd device by create dev node themselves and have
- * kernel automatically instantiate actual device on-demand.
+ * (1) if rd_nr is specified, create that many upfront. else
+ * it defaults to CONFIG_BLK_DEV_RAM_COUNT
+ * (2) User can further extend brd devices by create dev node themselves
+ * and have kernel automatically instantiate actual device
+ * on-demand. Example:
+ * mknod /path/devnod_name b 1 X # 1 is the rd major
+ * fdisk -l /path/devnod_name
+ * If (X / max_part) was not already created it will be created
+ * dynamically.
*/
- part_shift = 0;
- if (max_part > 0) {
- part_shift = fls(max_part);
-
- /*
- * Adjust max_part according to part_shift as it is exported
- * to user space so that user can decide correct minor number
- * if [s]he want to create more devices.
- *
- * Note that -1 is required because partition 0 is reserved
- * for the whole disk.
- */
- max_part = (1UL << part_shift) - 1;
- }
-
- if ((1UL << part_shift) > DISK_MAX_PARTS)
- return -EINVAL;
-
- if (rd_nr > 1UL << (MINORBITS - part_shift))
- return -EINVAL;
-
- if (rd_nr) {
- nr = rd_nr;
- range = rd_nr << part_shift;
- } else {
- nr = CONFIG_BLK_DEV_RAM_COUNT;
- range = 1UL << MINORBITS;
- }
-
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
- for (i = 0; i < nr; i++) {
+ if (unlikely(!max_part))
+ max_part = 1;
+
+ for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
if (!brd)
goto out_free;
@@ -631,10 +616,10 @@ static int __init brd_init(void)
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
- blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+ blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
- printk(KERN_INFO "brd: module loaded\n");
+ pr_info("brd: module loaded\n");
return 0;
out_free:
@@ -644,21 +629,21 @@ out_free:
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+ pr_info("brd: module NOT loaded !!!\n");
return -ENOMEM;
}
static void __exit brd_exit(void)
{
- unsigned long range;
struct brd_device *brd, *next;
- range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
-
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
- blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+ blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+ pr_info("brd: module unloaded\n");
}
module_init(brd_init);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d169b4a79267..cee20354ac37 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
if (blkdev_issue_zeroout(device->ldev->backing_bdev,
- sector, data_size >> 9, GFP_NOIO))
+ sector, data_size >> 9, GFP_NOIO, false))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
return 0;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ffb08e1..a08cda955285 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+static struct attribute *floppy_dev_attrs[] = {
+ &dev_attr_cmos.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
static void floppy_device_release(struct device *dev)
{
}
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
floppy_device[drive].name = floppy_device_name;
floppy_device[drive].id = drive;
floppy_device[drive].dev.release = floppy_device_release;
+ floppy_device[drive].dev.groups = floppy_dev_groups;
err = platform_device_register(&floppy_device[drive]);
if (err)
goto out_remove_drives;
- err = device_create_file(&floppy_device[drive].dev,
- &dev_attr_cmos);
- if (err)
- goto out_unreg_platform_dev;
-
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
return 0;
-out_unreg_platform_dev:
- platform_device_unregister(&floppy_device[drive]);
out_remove_drives:
while (drive--) {
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
- device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
}
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
- device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb47c25..d1f168b73634 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex);
static int max_part;
static int part_shift;
+static struct workqueue_struct *loop_wq;
+
/*
* Transfer functions
*/
@@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
return ret;
}
-static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
+static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos)
{
int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
struct page *page);
struct bio_vec bvec;
- struct bvec_iter iter;
+ struct req_iterator iter;
struct page *page = NULL;
int ret = 0;
@@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
do_lo_send = do_lo_send_direct_write;
}
- bio_for_each_segment(bvec, bio, iter) {
+ rq_for_each_segment(bvec, rq, iter) {
ret = do_lo_send(lo, &bvec, pos, page);
if (ret < 0)
break;
@@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo,
}
static int
-lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
{
struct bio_vec bvec;
- struct bvec_iter iter;
+ struct req_iterator iter;
ssize_t s;
- bio_for_each_segment(bvec, bio, iter) {
+ rq_for_each_segment(bvec, rq, iter) {
s = do_lo_receive(lo, &bvec, bsize, pos);
if (s < 0)
return s;
if (s != bvec.bv_len) {
- zero_fill_bio(bio);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
break;
}
pos += bvec.bv_len;
@@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
return 0;
}
-static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
+static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
{
- loff_t pos;
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ struct file *file = lo->lo_backing_file;
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
- pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset;
-
- if (bio_rw(bio) == WRITE) {
- struct file *file = lo->lo_backing_file;
-
- if (bio->bi_rw & REQ_FLUSH) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL)) {
- ret = -EIO;
- goto out;
- }
- }
-
- /*
- * We use punch hole to reclaim the free space used by the
- * image a.k.a. discard. However we do not support discard if
- * encryption is enabled, because it may give an attacker
- * useful information.
- */
- if (bio->bi_rw & REQ_DISCARD) {
- struct file *file = lo->lo_backing_file;
- int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
-
- if ((!file->f_op->fallocate) ||
- lo->lo_encrypt_key_size) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- ret = file->f_op->fallocate(file, mode, pos,
- bio->bi_iter.bi_size);
- if (unlikely(ret && ret != -EINVAL &&
- ret != -EOPNOTSUPP))
- ret = -EIO;
- goto out;
- }
-
- ret = lo_send(lo, bio, pos);
-
- if ((bio->bi_rw & REQ_FUA) && !ret) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL))
- ret = -EIO;
- }
- } else
- ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
+ if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
-out:
+ ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+ if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+ ret = -EIO;
+ out:
return ret;
}
-/*
- * Add bio to back of pending list
- */
-static void loop_add_bio(struct loop_device *lo, struct bio *bio)
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
- lo->lo_bio_count++;
- bio_list_add(&lo->lo_bio_list, bio);
-}
+ struct file *file = lo->lo_backing_file;
+ int ret = vfs_fsync(file, 0);
+ if (unlikely(ret && ret != -EINVAL))
+ ret = -EIO;
-/*
- * Grab first pending buffer
- */
-static struct bio *loop_get_bio(struct loop_device *lo)
-{
- lo->lo_bio_count--;
- return bio_list_pop(&lo->lo_bio_list);
+ return ret;
}
-static void loop_make_request(struct request_queue *q, struct bio *old_bio)
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
- struct loop_device *lo = q->queuedata;
- int rw = bio_rw(old_bio);
-
- if (rw == READA)
- rw = READ;
+ loff_t pos;
+ int ret;
- BUG_ON(!lo || (rw != READ && rw != WRITE));
+ pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
- spin_lock_irq(&lo->lo_lock);
- if (lo->lo_state != Lo_bound)
- goto out;
- if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
- goto out;
- if (lo->lo_bio_count >= q->nr_congestion_on)
- wait_event_lock_irq(lo->lo_req_wait,
- lo->lo_bio_count < q->nr_congestion_off,
- lo->lo_lock);
- loop_add_bio(lo, old_bio);
- wake_up(&lo->lo_event);
- spin_unlock_irq(&lo->lo_lock);
- return;
+ if (rq->cmd_flags & REQ_WRITE) {
+ if (rq->cmd_flags & REQ_FLUSH)
+ ret = lo_req_flush(lo, rq);
+ else if (rq->cmd_flags & REQ_DISCARD)
+ ret = lo_discard(lo, rq, pos);
+ else
+ ret = lo_send(lo, rq, pos);
+ } else
+ ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
-out:
- spin_unlock_irq(&lo->lo_lock);
- bio_io_error(old_bio);
+ return ret;
}
struct switch_request {
@@ -518,57 +475,26 @@ struct switch_request {
struct completion wait;
};
-static void do_loop_switch(struct loop_device *, struct switch_request *);
-
-static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
-{
- if (unlikely(!bio->bi_bdev)) {
- do_loop_switch(lo, bio->bi_private);
- bio_put(bio);
- } else {
- int ret = do_bio_filebacked(lo, bio);
- bio_endio(bio, ret);
- }
-}
-
/*
- * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
- *
- * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
- * calling kthread_stop(). Therefore once kthread_should_stop() is
- * true, make_request will not place any more requests. Therefore
- * once kthread_should_stop() is true and lo_bio is NULL, we are
- * done with the loop.
+ * Do the actual switch; called from the BIO completion routine
*/
-static int loop_thread(void *data)
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
- struct loop_device *lo = data;
- struct bio *bio;
-
- set_user_nice(current, MIN_NICE);
-
- while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
-
- wait_event_interruptible(lo->lo_event,
- !bio_list_empty(&lo->lo_bio_list) ||
- kthread_should_stop());
-
- if (bio_list_empty(&lo->lo_bio_list))
- continue;
- spin_lock_irq(&lo->lo_lock);
- bio = loop_get_bio(lo);
- if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
- wake_up(&lo->lo_req_wait);
- spin_unlock_irq(&lo->lo_lock);
+ struct file *file = p->file;
+ struct file *old_file = lo->lo_backing_file;
+ struct address_space *mapping;
- BUG_ON(!bio);
- loop_handle_bio(lo, bio);
- }
+ /* if no new file, only flush of queued bios requested */
+ if (!file)
+ return;
- return 0;
+ mapping = file->f_mapping;
+ mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+ lo->lo_backing_file = file;
+ lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
}
/*
@@ -579,15 +505,18 @@ static int loop_thread(void *data)
static int loop_switch(struct loop_device *lo, struct file *file)
{
struct switch_request w;
- struct bio *bio = bio_alloc(GFP_KERNEL, 0);
- if (!bio)
- return -ENOMEM;
- init_completion(&w.wait);
+
w.file = file;
- bio->bi_private = &w;
- bio->bi_bdev = NULL;
- loop_make_request(lo->lo_queue, bio);
- wait_for_completion(&w.wait);
+
+ /* freeze queue and wait for completion of scheduled requests */
+ blk_mq_freeze_queue(lo->lo_queue);
+
+ /* do the switch action */
+ do_loop_switch(lo, &w);
+
+ /* unfreeze */
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
return 0;
}
@@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file)
*/
static int loop_flush(struct loop_device *lo)
{
- /* loop not yet configured, no running thread, nothing to flush */
- if (!lo->lo_thread)
- return 0;
-
return loop_switch(lo, NULL);
}
/*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
-{
- struct file *file = p->file;
- struct file *old_file = lo->lo_backing_file;
- struct address_space *mapping;
-
- /* if no new file, only flush of queued bios requested */
- if (!file)
- goto out;
-
- mapping = file->f_mapping;
- mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
- lo->lo_backing_file = file;
- lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
- mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
- lo->old_gfp_mask = mapping_gfp_mask(mapping);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-out:
- complete(&p->wait);
-}
-
-
-/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
* the original file and in High Availability environments to switch to
@@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->transfer = transfer_none;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
- lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
- bio_list_init(&lo->lo_bio_list);
-
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
@@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_blocksize(bdev, lo_blocksize);
- lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
- lo->lo_number);
- if (IS_ERR(lo->lo_thread)) {
- error = PTR_ERR(lo->lo_thread);
- goto out_clr;
- }
lo->lo_state = Lo_bound;
- wake_up_process(lo->lo_thread);
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
bdgrab(bdev);
return 0;
-out_clr:
- loop_sysfs_exit(lo);
- lo->lo_thread = NULL;
- lo->lo_device = NULL;
- lo->lo_backing_file = NULL;
- lo->lo_flags = 0;
- set_capacity(lo->lo_disk, 0);
- invalidate_bdev(bdev);
- bd_set_size(bdev, 0);
- kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
- lo->lo_state = Lo_unbound;
out_putf:
fput(file);
out:
@@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo)
spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_rundown;
- spin_unlock_irq(&lo->lo_lock);
-
- kthread_stop(lo->lo_thread);
-
- spin_lock_irq(&lo->lo_lock);
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
lo->lo_encrypt_key_size = 0;
- lo->lo_thread = NULL;
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+
+ blk_mq_start_request(bd->rq);
+
+ if (cmd->rq->cmd_flags & REQ_WRITE) {
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ bool need_sched = true;
+
+ spin_lock_irq(&lo->lo_lock);
+ if (lo->write_started)
+ need_sched = false;
+ else
+ lo->write_started = true;
+ list_add_tail(&cmd->list, &lo->write_cmd_head);
+ spin_unlock_irq(&lo->lo_lock);
+
+ if (need_sched)
+ queue_work(loop_wq, &lo->write_work);
+ } else {
+ queue_work(loop_wq, &cmd->read_work);
+ }
+
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+ const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ int ret = -EIO;
+
+ if (lo->lo_state != Lo_bound)
+ goto failed;
+
+ if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+ goto failed;
+
+ ret = do_req_filebacked(lo, cmd->rq);
+
+ failed:
+ if (ret)
+ cmd->rq->errors = -EIO;
+ blk_mq_complete_request(cmd->rq);
+}
+
+static void loop_queue_write_work(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, write_work);
+ LIST_HEAD(cmd_list);
+
+ spin_lock_irq(&lo->lo_lock);
+ repeat:
+ list_splice_init(&lo->write_cmd_head, &cmd_list);
+ spin_unlock_irq(&lo->lo_lock);
+
+ while (!list_empty(&cmd_list)) {
+ struct loop_cmd *cmd = list_first_entry(&cmd_list,
+ struct loop_cmd, list);
+ list_del_init(&cmd->list);
+ loop_handle_cmd(cmd);
+ }
+
+ spin_lock_irq(&lo->lo_lock);
+ if (!list_empty(&lo->write_cmd_head))
+ goto repeat;
+ lo->write_started = false;
+ spin_unlock_irq(&lo->lo_lock);
+}
+
+static void loop_queue_read_work(struct work_struct *work)
+{
+ struct loop_cmd *cmd =
+ container_of(work, struct loop_cmd, read_work);
+
+ loop_handle_cmd(cmd);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->rq = rq;
+ INIT_WORK(&cmd->read_work, loop_queue_read_work);
+
+ return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+ .queue_rq = loop_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = loop_init_request,
+};
+
static int loop_add(struct loop_device **l, int i)
{
struct loop_device *lo;
@@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i)
i = err;
err = -ENOMEM;
- lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
- if (!lo->lo_queue)
+ lo->tag_set.ops = &loop_mq_ops;
+ lo->tag_set.nr_hw_queues = 1;
+ lo->tag_set.queue_depth = 128;
+ lo->tag_set.numa_node = NUMA_NO_NODE;
+ lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+ lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ lo->tag_set.driver_data = lo;
+
+ err = blk_mq_alloc_tag_set(&lo->tag_set);
+ if (err)
goto out_free_idr;
- /*
- * set queue make_request_fn
- */
- blk_queue_make_request(lo->lo_queue, loop_make_request);
+ lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+ if (IS_ERR_OR_NULL(lo->lo_queue)) {
+ err = PTR_ERR(lo->lo_queue);
+ goto out_cleanup_tags;
+ }
lo->lo_queue->queuedata = lo;
+ INIT_LIST_HEAD(&lo->write_cmd_head);
+ INIT_WORK(&lo->write_work, loop_queue_write_work);
+
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;
@@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i)
disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
lo->lo_number = i;
- lo->lo_thread = NULL;
- init_waitqueue_head(&lo->lo_event);
- init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
@@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i)
out_free_queue:
blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
idr_remove(&loop_index_idr, i);
out_free_dev:
@@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo)
{
del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
+ blk_mq_free_tag_set(&lo->tag_set);
put_disk(lo->lo_disk);
kfree(lo);
}
@@ -1875,6 +1858,13 @@ static int __init loop_init(void)
goto misc_out;
}
+ loop_wq = alloc_workqueue("kloopd",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
+ if (!loop_wq) {
+ err = -ENOMEM;
+ goto misc_out;
+ }
+
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);
@@ -1912,6 +1902,8 @@ static void __exit loop_exit(void)
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
unregister_blkdev(LOOP_MAJOR, "loop");
+ destroy_workqueue(loop_wq);
+
misc_deregister(&loop_misc);
}
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 90df5d6485b6..301c27f8323f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -11,8 +11,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
+#include <linux/workqueue.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
@@ -52,19 +54,23 @@ struct loop_device {
gfp_t old_gfp_mask;
spinlock_t lo_lock;
- struct bio_list lo_bio_list;
- unsigned int lo_bio_count;
+ struct list_head write_cmd_head;
+ struct work_struct write_work;
+ bool write_started;
int lo_state;
struct mutex lo_ctl_mutex;
- struct task_struct *lo_thread;
- wait_queue_head_t lo_event;
- /* wait queue for incoming requests */
- wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
+ struct blk_mq_tag_set tag_set;
struct gendisk *lo_disk;
};
+struct loop_cmd {
+ struct work_struct read_work;
+ struct request *rq;
+ struct list_head list;
+};
+
/* Support for loadable transfer modules */
struct loop_func_table {
int number; /* filter type */
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index aa2224aa7caa..65cd61a4145e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -579,7 +579,7 @@ static int null_add_dev(void)
sector_div(size, bs);
set_capacity(disk, size);
- disk->flags |= GENHD_FL_EXT_DEVT;
+ disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
disk->first_minor = nullb->index;
disk->fops = &null_fops;
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3e62c8..cbdfbbf98392 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
void *ctx;
int aborted;
struct nvme_queue *nvmeq;
+ struct nvme_iod iod[0];
};
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES 2
+#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed. This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+ unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+ return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+ unsigned int ret = sizeof(struct nvme_cmd_info);
+
+ ret += sizeof(struct nvme_iod);
+ ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+ ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+ return ret;
+}
+
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}
+static void *iod_get_private(struct nvme_iod *iod)
+{
+ return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+ return (iod->private & 0x01) == 0;
+}
+
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
return ((void *)iod) + iod->offset;
}
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+ unsigned nseg, unsigned long private)
{
- unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
- return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+ iod->private = private;
+ iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+ iod->npages = -1;
+ iod->length = nbytes;
+ iod->nents = 0;
}
static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+ unsigned long priv, gfp_t gfp)
{
struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
- sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+ sizeof(__le64 *) * nvme_npages(bytes, dev) +
sizeof(struct scatterlist) * nseg, gfp);
- if (iod) {
- iod->offset = offsetof(struct nvme_iod, sg[nseg]);
- iod->npages = -1;
- iod->length = nbytes;
- iod->nents = 0;
- iod->first_dma = 0ULL;
- }
+ if (iod)
+ iod_init(iod, bytes, nseg, priv);
return iod;
}
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+ gfp_t gfp)
+{
+ unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+ sizeof(struct nvme_dsm_range);
+ unsigned long mask = 0;
+ struct nvme_iod *iod;
+
+ if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+ size <= NVME_INT_BYTES(dev)) {
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+ iod = cmd->iod;
+ mask = 0x01;
+ iod_init(iod, size, rq->nr_phys_segments,
+ (unsigned long) rq | 0x01);
+ return iod;
+ }
+
+ return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+ (unsigned long) rq, gfp);
+}
+
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
const int last_prp = dev->page_size / 8 - 1;
@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
prp_dma = next_prp_dma;
}
- kfree(iod);
+
+ if (iod_should_kfree(iod))
+ kfree(iod);
}
static int nvme_error_status(u16 status)
@@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
- struct request *req = iod->private;
+ struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
- struct request *req = iod->private;
+ struct request *req = iod_get_private(iod);
struct nvme_command *cmnd;
u16 control = 0;
u32 dsmgmt = 0;
@@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
- int psegs = req->nr_phys_segments;
enum dma_data_direction dma_dir;
- unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
- sizeof(struct nvme_dsm_range);
- iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+ iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
- iod->private = req;
-
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
@@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
- } else if (psegs) {
+ } else if (req->nr_phys_segments) {
dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
- sg_init_table(iod->sg, psegs);
+ sg_init_table(iod->sg, req->nr_phys_segments);
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto error_cmd;
@@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
- LLIST_HEAD(q_list);
- struct nvme_queue *nvmeq, *next;
- struct llist_node *entry;
int i;
for (i = dev->queue_count - 1; i >= lowest; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
- llist_add(&nvmeq->node, &q_list);
dev->queue_count--;
dev->queues[i] = NULL;
- }
- synchronize_rcu();
- entry = llist_del_all(&q_list);
- llist_for_each_entry_safe(nvmeq, next, entry, node)
nvme_free_queue(nvmeq);
+ }
}
/**
@@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
- dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
}
err = -ENOMEM;
- iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+ iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
if (!iod)
goto put_pages;
@@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->tagset.cmd_size = nvme_cmd_size(dev);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 79aa179305b5..e22942596207 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
}
/* switch queue to TCQ mode; allocate tag map */
- rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+ rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
if (rc) {
blk_cleanup_queue(q);
put_disk(disk);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index cc90a840e616..375d28851860 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
BLKIF_PROTOCOL_X86_64 = 3,
};
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489e757d..e3afe97280b1 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
return err;
}
- be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
- strcpy(protocol, "unspecified, assuming native");
+ strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d2cae5fc211a..37779e4c4585 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
- blk_put_request(copy[i].request);
+ blk_end_request_all(copy[i].request, 0);
}
kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
req->bio = NULL;
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
pr_alert("diskcache flush request found!\n");
- __blk_put_request(info->rq, req);
+ __blk_end_request_all(req, 0);
}
spin_unlock_irq(&info->io_lock);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 4c58333b4257..9a6b63783a94 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file,
return pgoff << PAGE_SHIFT;
}
+/* permit direct mmap, for read, write or exec */
+static unsigned memory_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
+static unsigned zero_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_COPY;
+}
+
/* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_struct *vma)
{
return vma->vm_flags & VM_MAYSHARE;
}
#else
-#define get_unmapped_area_mem NULL
static inline int private_mapping_ok(struct vm_area_struct *vma)
{
@@ -721,7 +732,10 @@ static const struct file_operations mem_fops = {
.write = write_mem,
.mmap = mmap_mem,
.open = open_mem,
+#ifndef CONFIG_MMU
.get_unmapped_area = get_unmapped_area_mem,
+ .mmap_capabilities = memory_mmap_capabilities,
+#endif
};
#ifdef CONFIG_DEVKMEM
@@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = {
.write = write_kmem,
.mmap = mmap_kmem,
.open = open_kmem,
+#ifndef CONFIG_MMU
.get_unmapped_area = get_unmapped_area_mem,
+ .mmap_capabilities = memory_mmap_capabilities,
+#endif
};
#endif
@@ -760,16 +777,9 @@ static const struct file_operations zero_fops = {
.read_iter = read_iter_zero,
.aio_write = aio_write_zero,
.mmap = mmap_zero,
-};
-
-/*
- * capabilities for /dev/zero
- * - permits private mappings, "copies" are taken of the source of zeros
- * - no writeback happens
- */
-static struct backing_dev_info zero_bdi = {
- .name = "char/mem",
- .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK,
+#ifndef CONFIG_MMU
+ .mmap_capabilities = zero_mmap_capabilities,
+#endif
};
static const struct file_operations full_fops = {
@@ -783,22 +793,22 @@ static const struct memdev {
const char *name;
umode_t mode;
const struct file_operations *fops;
- struct backing_dev_info *dev_info;
+ fmode_t fmode;
} devlist[] = {
- [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi },
+ [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET },
#ifdef CONFIG_DEVKMEM
- [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi },
+ [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET },
#endif
- [3] = { "null", 0666, &null_fops, NULL },
+ [3] = { "null", 0666, &null_fops, 0 },
#ifdef CONFIG_DEVPORT
- [4] = { "port", 0, &port_fops, NULL },
+ [4] = { "port", 0, &port_fops, 0 },
#endif
- [5] = { "zero", 0666, &zero_fops, &zero_bdi },
- [7] = { "full", 0666, &full_fops, NULL },
- [8] = { "random", 0666, &random_fops, NULL },
- [9] = { "urandom", 0666, &urandom_fops, NULL },
+ [5] = { "zero", 0666, &zero_fops, 0 },
+ [7] = { "full", 0666, &full_fops, 0 },
+ [8] = { "random", 0666, &random_fops, 0 },
+ [9] = { "urandom", 0666, &urandom_fops, 0 },
#ifdef CONFIG_PRINTK
- [11] = { "kmsg", 0644, &kmsg_fops, NULL },
+ [11] = { "kmsg", 0644, &kmsg_fops, 0 },
#endif
};
@@ -816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp)
return -ENXIO;
filp->f_op = dev->fops;
- if (dev->dev_info)
- filp->f_mapping->backing_dev_info = dev->dev_info;
-
- /* Is /dev/mem or /dev/kmem ? */
- if (dev->dev_info == &directly_mappable_cdev_bdi)
- filp->f_mode |= FMODE_UNSIGNED_OFFSET;
+ filp->f_mode |= dev->fmode;
if (dev->fops->open)
return dev->fops->open(inode, filp);
@@ -846,11 +851,6 @@ static struct class *mem_class;
static int __init chr_dev_init(void)
{
int minor;
- int err;
-
- err = bdi_init(&zero_bdi);
- if (err)
- return err;
if (register_chrdev(MEM_MAJOR, "mem", &memory_fops))
printk("unable to get major %d for memory devs\n", MEM_MAJOR);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index a24891b97547..6e29bf2db536 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp)
mutex_lock(&raw_mutex);
bdev = raw_devices[minor].binding;
- if (--raw_devices[minor].inuse == 0) {
+ if (--raw_devices[minor].inuse == 0)
/* Here inode->i_mapping == bdev->bd_inode->i_mapping */
inode->i_mapping = &inode->i_data;
- inode->i_mapping->backing_dev_info = &default_backing_dev_info;
- }
mutex_unlock(&raw_mutex);
blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index c355a226a024..c39644478aa4 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,9 +231,8 @@ config DM_CRYPT
transparently encrypts the data on it. You'll need to activate
the ciphers you're going to use in the cryptoapi configuration.
- Information on how to use dm-crypt can be found on
-
- <http://www.saout.de/misc/dm-crypt/>
+ For further information on dm-crypt and userspace tools see:
+ <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
To compile this code as a module, choose M here: the module will
be called dm-crypt.
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1695ee5f3ffc..3a5767968ba0 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev)
return;
mutex_lock(&mddev->bitmap_info.mutex);
+ spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
+ spin_unlock(&mddev->lock);
mutex_unlock(&mddev->bitmap_info.mutex);
if (mddev->thread)
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
@@ -2209,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t can_clear_show(struct mddev *mddev, char *page)
{
int len;
+ spin_lock(&mddev->lock);
if (mddev->bitmap)
len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
"false" : "true"));
else
len = sprintf(page, "\n");
+ spin_unlock(&mddev->lock);
return len;
}
@@ -2238,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
static ssize_t
behind_writes_used_show(struct mddev *mddev, char *page)
{
+ ssize_t ret;
+ spin_lock(&mddev->lock);
if (mddev->bitmap == NULL)
- return sprintf(page, "0\n");
- return sprintf(page, "%lu\n",
- mddev->bitmap->behind_writes_used);
+ ret = sprintf(page, "0\n");
+ else
+ ret = sprintf(page, "%lu\n",
+ mddev->bitmap->behind_writes_used);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index c33b49792b87..86dbbc737402 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
@@ -1739,7 +1740,7 @@ static unsigned get_max_age_hz(void)
static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
- return (jiffies - b->last_accessed) >= age_hz;
+ return time_after_eq(jiffies, b->last_accessed + age_hz);
}
static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e1650539cc2f..7755af351867 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -11,6 +11,7 @@
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
@@ -1562,8 +1563,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
static int need_commit_due_to_time(struct cache *cache)
{
- return jiffies < cache->last_commit_jiffies ||
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+ return !time_in_range(jiffies, cache->last_commit_jiffies,
+ cache->last_commit_jiffies + COMMIT_PERIOD);
}
static int commit_if_needed(struct cache *cache)
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 73f791bb9ea4..c8a18e4ee9dc 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -639,8 +639,8 @@ static int check_name(const char *name)
/*
* On successful return, the caller must not attempt to acquire
- * _hash_lock without first calling dm_table_put, because dm_table_destroy
- * waits for this dm_table_put and could be called under this lock.
+ * _hash_lock without first calling dm_put_live_table, because dm_table_destroy
+ * waits for this dm_put_live_table and could be called under this lock.
*/
static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
{
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index b953db6cc229..03177ca0b009 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -6,6 +6,7 @@
#include <linux/bio.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include <linux/dm-dirty-log.h>
#include <linux/device-mapper.h>
#include <linux/dm-log-userspace.h>
@@ -829,7 +830,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log,
int r;
uint64_t region64 = region;
struct log_c *lc = log->context;
- static unsigned long long limit;
+ static unsigned long limit;
struct {
int64_t is_recovering;
uint64_t in_sync_hint;
@@ -845,7 +846,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log,
*/
if (region < lc->in_sync_hint)
return 0;
- else if (jiffies < limit)
+ else if (time_after(limit, jiffies))
return 1;
limit = jiffies + (HZ / 4);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7b6b0f0f831a..d376dc87716e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -11,6 +11,7 @@
#include "dm-path-selector.h"
#include "dm-uevent.h"
+#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
@@ -378,18 +379,18 @@ static int __must_push_back(struct multipath *m)
/*
* Map cloned requests
*/
-static int multipath_map(struct dm_target *ti, struct request *clone,
- union map_info *map_context)
+static int __multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context,
+ struct request *rq, struct request **__clone)
{
struct multipath *m = (struct multipath *) ti->private;
int r = DM_MAPIO_REQUEUE;
- size_t nr_bytes = blk_rq_bytes(clone);
- unsigned long flags;
+ size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
@@ -411,25 +412,61 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
/* ENOMEM, requeue */
goto out_unlock;
- bdev = pgpath->path.dev->bdev;
- clone->q = bdev_get_queue(bdev);
- clone->rq_disk = bdev->bd_disk;
- clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
mpio = map_context->ptr;
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
+
+ bdev = pgpath->path.dev->bdev;
+
+ spin_unlock_irq(&m->lock);
+
+ if (clone) {
+ /* Old request-based interface: allocated clone is passed in */
+ clone->q = bdev_get_queue(bdev);
+ clone->rq_disk = bdev->bd_disk;
+ clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ } else {
+ /* blk-mq request-based interface */
+ *__clone = blk_get_request(bdev_get_queue(bdev),
+ rq_data_dir(rq), GFP_KERNEL);
+ if (IS_ERR(*__clone))
+ /* ENOMEM, requeue */
+ return r;
+ (*__clone)->bio = (*__clone)->biotail = NULL;
+ (*__clone)->rq_disk = bdev->bd_disk;
+ (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ }
+
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
&pgpath->path,
nr_bytes);
- r = DM_MAPIO_REMAPPED;
+ return DM_MAPIO_REMAPPED;
out_unlock:
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
return r;
}
+static int multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
+{
+ return __multipath_map(ti, clone, map_context, NULL, NULL);
+}
+
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return __multipath_map(ti, NULL, map_context, rq, clone);
+}
+
+static void multipath_release_clone(struct request *clone)
+{
+ blk_put_request(clone);
+}
+
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
@@ -1666,11 +1703,13 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 7, 0},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
+ .clone_and_map_rq = multipath_clone_and_map,
+ .release_clone_rq = multipath_release_clone,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
@@ -1694,16 +1733,15 @@ static int __init dm_multipath_init(void)
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("register failed %d", r);
- kmem_cache_destroy(_mpio_cache);
- return -EINVAL;
+ r = -EINVAL;
+ goto bad_register_target;
}
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
- dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
- return -ENOMEM;
+ r = -ENOMEM;
+ goto bad_alloc_kmultipathd;
}
/*
@@ -1716,16 +1754,23 @@ static int __init dm_multipath_init(void)
WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
- destroy_workqueue(kmultipathd);
- dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
- return -ENOMEM;
+ r = -ENOMEM;
+ goto bad_alloc_kmpath_handlerd;
}
DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
multipath_target.version[2]);
+ return 0;
+
+bad_alloc_kmpath_handlerd:
+ destroy_workqueue(kmultipathd);
+bad_alloc_kmultipathd:
+ dm_unregister_target(&multipath_target);
+bad_register_target:
+ kmem_cache_destroy(_mpio_cache);
+
return r;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 07c0fa0fa284..88e4c7f24986 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
- if (rs->raid_type->level == 1)
- return md_raid1_congested(&rs->md, bits);
-
- if (rs->raid_type->level == 10)
- return md_raid10_congested(&rs->md, bits);
-
- return md_raid5_congested(&rs->md, bits);
+ return mddev_congested(&rs->md, bits);
}
/*
@@ -1243,7 +1237,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
argv++;
/* Skip over RAID params for now and find out # of devices */
- if (num_raid_params + 1 > argc) {
+ if (num_raid_params >= argc) {
ti->error = "Arguments do not agree with counts given";
return -EINVAL;
}
@@ -1254,6 +1248,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
return -EINVAL;
}
+ argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+ if (argc != (num_raid_devs * 2)) {
+ ti->error = "Supplied RAID devices does not match the count given";
+ return -EINVAL;
+ }
+
rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
if (IS_ERR(rs))
return PTR_ERR(rs);
@@ -1262,16 +1262,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (ret)
goto bad;
- ret = -EINVAL;
-
- argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
argv += num_raid_params + 1;
- if (argc != (num_raid_devs * 2)) {
- ti->error = "Supplied RAID devices does not match the count given";
- goto bad;
- }
-
ret = dev_parms(rs, argv);
if (ret)
goto bad;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index d6e88178d22c..808b8419bc48 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -200,16 +200,11 @@ err_area:
static void free_area(struct pstore *ps)
{
- if (ps->area)
- vfree(ps->area);
+ vfree(ps->area);
ps->area = NULL;
-
- if (ps->zero_area)
- vfree(ps->zero_area);
+ vfree(ps->zero_area);
ps->zero_area = NULL;
-
- if (ps->header_area)
- vfree(ps->header_area);
+ vfree(ps->header_area);
ps->header_area = NULL;
}
@@ -605,8 +600,7 @@ static void persistent_dtr(struct dm_exception_store *store)
free_area(ps);
/* Allocated in persistent_read_metadata */
- if (ps->callbacks)
- vfree(ps->callbacks);
+ vfree(ps->callbacks);
kfree(ps);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3afae9e062f8..6554d9148927 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -827,10 +827,11 @@ static int dm_table_set_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
+ bool use_blk_mq = false;
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices;
- unsigned live_md_type;
+ unsigned live_md_type = dm_get_md_type(t->md);
for (i = 0; i < t->num_targets; i++) {
tgt = t->targets + i;
@@ -854,8 +855,8 @@ static int dm_table_set_type(struct dm_table *t)
* Determine the type from the live device.
* Default to bio-based if device is new.
*/
- live_md_type = dm_get_md_type(t->md);
- if (live_md_type == DM_TYPE_REQUEST_BASED)
+ if (live_md_type == DM_TYPE_REQUEST_BASED ||
+ live_md_type == DM_TYPE_MQ_REQUEST_BASED)
request_based = 1;
else
bio_based = 1;
@@ -869,16 +870,6 @@ static int dm_table_set_type(struct dm_table *t)
BUG_ON(!request_based); /* No targets in this table */
- /* Non-request-stackable devices can't be used for request-based dm */
- devices = dm_table_get_devices(t);
- list_for_each_entry(dd, devices, list) {
- if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) {
- DMWARN("table load rejected: including"
- " non-request-stackable devices");
- return -EINVAL;
- }
- }
-
/*
* Request-based dm supports only tables that have a single target now.
* To support multiple targets, request splitting support is needed,
@@ -890,7 +881,37 @@ static int dm_table_set_type(struct dm_table *t)
return -EINVAL;
}
- t->type = DM_TYPE_REQUEST_BASED;
+ /* Non-request-stackable devices can't be used for request-based dm */
+ devices = dm_table_get_devices(t);
+ list_for_each_entry(dd, devices, list) {
+ struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
+
+ if (!blk_queue_stackable(q)) {
+ DMERR("table load rejected: including"
+ " non-request-stackable devices");
+ return -EINVAL;
+ }
+
+ if (q->mq_ops)
+ use_blk_mq = true;
+ }
+
+ if (use_blk_mq) {
+ /* verify _all_ devices in the table are blk-mq devices */
+ list_for_each_entry(dd, devices, list)
+ if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
+ DMERR("table load rejected: not all devices"
+ " are blk-mq request-stackable");
+ return -EINVAL;
+ }
+ t->type = DM_TYPE_MQ_REQUEST_BASED;
+
+ } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) {
+ /* inherit live MD type */
+ t->type = live_md_type;
+
+ } else
+ t->type = DM_TYPE_REQUEST_BASED;
return 0;
}
@@ -907,7 +928,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
bool dm_table_request_based(struct dm_table *t)
{
- return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
+ unsigned table_type = dm_table_get_type(t);
+
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
+bool dm_table_mq_request_based(struct dm_table *t)
+{
+ return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
}
static int dm_table_alloc_md_mempools(struct dm_table *t)
@@ -1360,6 +1389,14 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
+static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
+}
+
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1480,6 +1517,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+ if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
+ queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
+
dm_table_set_integrity(t);
/*
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 242e3cec397a..925ec1b15e75 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone,
return -EIO;
}
+static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return -EIO;
+}
+
+static void io_err_release_clone_rq(struct request *clone)
+{
+}
+
static struct target_type error_target = {
.name = "error",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
.map_rq = io_err_map_rq,
+ .clone_and_map_rq = io_err_clone_and_map_rq,
+ .release_clone_rq = io_err_release_clone_rq,
};
int __init dm_target_init(void)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 43adbb863f5a..79f694120ddf 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1635,15 +1635,6 @@ int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
return r;
}
-int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
-{
- down_read(&pmd->root_lock);
- *result = pmd->data_block_size;
- up_read(&pmd->root_lock);
-
- return 0;
-}
-
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 921d15ee56a0..fac01a96d303 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -182,8 +182,6 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
dm_block_t *result);
-int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
-
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 07705ee181e3..654773cb1eee 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
@@ -1700,8 +1701,8 @@ static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell
*/
static int need_commit_due_to_time(struct pool *pool)
{
- return jiffies < pool->last_commit_jiffies ||
- jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+ return !time_in_range(jiffies, pool->last_commit_jiffies,
+ pool->last_commit_jiffies + COMMIT_PERIOD);
}
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2caf5b374649..ec1444f49de1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,6 +20,7 @@
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
+#include <linux/kthread.h>
#include <trace/events/block.h>
@@ -78,7 +79,8 @@ struct dm_io {
struct dm_rq_target_io {
struct mapped_device *md;
struct dm_target *ti;
- struct request *orig, clone;
+ struct request *orig, *clone;
+ struct kthread_work work;
int error;
union map_info info;
};
@@ -179,6 +181,7 @@ struct mapped_device {
* io objects are allocated from here.
*/
mempool_t *io_pool;
+ mempool_t *rq_pool;
struct bio_set *bs;
@@ -210,6 +213,9 @@ struct mapped_device {
unsigned internal_suspend_count;
struct dm_stats stats;
+
+ struct kthread_worker kworker;
+ struct task_struct *kworker_task;
};
/*
@@ -217,6 +223,7 @@ struct mapped_device {
*/
struct dm_md_mempools {
mempool_t *io_pool;
+ mempool_t *rq_pool;
struct bio_set *bs;
};
@@ -231,6 +238,7 @@ struct table_device {
#define RESERVED_MAX_IOS 1024
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
+static struct kmem_cache *_rq_cache;
/*
* Bio-based DM's mempools' reserved IOs set by the user.
@@ -288,9 +296,14 @@ static int __init local_init(void)
if (!_rq_tio_cache)
goto out_free_io_cache;
+ _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+ __alignof__(struct request), 0, NULL);
+ if (!_rq_cache)
+ goto out_free_rq_tio_cache;
+
r = dm_uevent_init();
if (r)
- goto out_free_rq_tio_cache;
+ goto out_free_rq_cache;
deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
if (!deferred_remove_workqueue) {
@@ -312,6 +325,8 @@ out_free_workqueue:
destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
+out_free_rq_cache:
+ kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
@@ -325,6 +340,7 @@ static void local_exit(void)
flush_scheduled_work();
destroy_workqueue(deferred_remove_workqueue);
+ kmem_cache_destroy(_rq_cache);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -577,6 +593,17 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
mempool_free(tio, tio->md->io_pool);
}
+static struct request *alloc_clone_request(struct mapped_device *md,
+ gfp_t gfp_mask)
+{
+ return mempool_alloc(md->rq_pool, gfp_mask);
+}
+
+static void free_clone_request(struct mapped_device *md, struct request *rq)
+{
+ mempool_free(rq, md->rq_pool);
+}
+
static int md_in_flight(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
@@ -992,7 +1019,7 @@ static void end_clone_bio(struct bio *clone, int error)
* the md may be freed in dm_put() at the end of this function.
* Or do dm_get() before calling this function and dm_put() later.
*/
-static void rq_completed(struct mapped_device *md, int rw, int run_queue)
+static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
atomic_dec(&md->pending[rw]);
@@ -1020,12 +1047,17 @@ static void free_rq_clone(struct request *clone)
struct dm_rq_target_io *tio = clone->end_io_data;
blk_rq_unprep_clone(clone);
+ if (clone->q && clone->q->mq_ops)
+ tio->ti->type->release_clone_rq(clone);
+ else
+ free_clone_request(tio->md, clone);
free_rq_tio(tio);
}
/*
* Complete the clone and the original request.
- * Must be called without queue lock.
+ * Must be called without clone's queue lock held,
+ * see end_clone_request() for more details.
*/
static void dm_end_request(struct request *clone, int error)
{
@@ -1054,23 +1086,23 @@ static void dm_end_request(struct request *clone, int error)
static void dm_unprep_request(struct request *rq)
{
- struct request *clone = rq->special;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = tio->clone;
rq->special = NULL;
rq->cmd_flags &= ~REQ_DONTPREP;
- free_rq_clone(clone);
+ if (clone)
+ free_rq_clone(clone);
}
/*
* Requeue the original request of a clone.
*/
-void dm_requeue_unmapped_request(struct request *clone)
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+ struct request *rq)
{
- int rw = rq_data_dir(clone);
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct mapped_device *md = tio->md;
- struct request *rq = tio->orig;
+ int rw = rq_data_dir(rq);
struct request_queue *q = rq->q;
unsigned long flags;
@@ -1080,9 +1112,15 @@ void dm_requeue_unmapped_request(struct request *clone)
blk_requeue_request(q, rq);
spin_unlock_irqrestore(q->queue_lock, flags);
- rq_completed(md, rw, 0);
+ rq_completed(md, rw, false);
+}
+
+static void dm_requeue_unmapped_request(struct request *clone)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ dm_requeue_unmapped_original_request(tio->md, tio->orig);
}
-EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
static void __stop_queue(struct request_queue *q)
{
@@ -1151,8 +1189,15 @@ static void dm_done(struct request *clone, int error, bool mapped)
static void dm_softirq_done(struct request *rq)
{
bool mapped = true;
- struct request *clone = rq->completion_data;
- struct dm_rq_target_io *tio = clone->end_io_data;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = tio->clone;
+
+ if (!clone) {
+ blk_end_request_all(rq, tio->error);
+ rq_completed(tio->md, rq_data_dir(rq), false);
+ free_rq_tio(tio);
+ return;
+ }
if (rq->cmd_flags & REQ_FAILED)
mapped = false;
@@ -1164,13 +1209,11 @@ static void dm_softirq_done(struct request *rq)
* Complete the clone and the original request with the error status
* through softirq context.
*/
-static void dm_complete_request(struct request *clone, int error)
+static void dm_complete_request(struct request *rq, int error)
{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *rq = tio->orig;
+ struct dm_rq_target_io *tio = rq->special;
tio->error = error;
- rq->completion_data = clone;
blk_complete_request(rq);
}
@@ -1178,40 +1221,40 @@ static void dm_complete_request(struct request *clone, int error)
* Complete the not-mapped clone and the original request with the error status
* through softirq context.
* Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() function fails.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
*/
-void dm_kill_unmapped_request(struct request *clone, int error)
+static void dm_kill_unmapped_request(struct request *rq, int error)
{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *rq = tio->orig;
-
rq->cmd_flags |= REQ_FAILED;
- dm_complete_request(clone, error);
+ dm_complete_request(rq, error);
}
-EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
/*
- * Called with the queue lock held
+ * Called with the clone's queue lock held
*/
static void end_clone_request(struct request *clone, int error)
{
- /*
- * For just cleaning up the information of the queue in which
- * the clone was dispatched.
- * The clone is *NOT* freed actually here because it is alloced from
- * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
- */
- __blk_put_request(clone->q, clone);
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ if (!clone->q->mq_ops) {
+ /*
+ * For just cleaning up the information of the queue in which
+ * the clone was dispatched.
+ * The clone is *NOT* freed actually here because it is alloced
+ * from dm own mempool (REQ_ALLOCED isn't set).
+ */
+ __blk_put_request(clone->q, clone);
+ }
/*
* Actual request completion is done in a softirq context which doesn't
- * hold the queue lock. Otherwise, deadlock could occur because:
+ * hold the clone's queue lock. Otherwise, deadlock could occur because:
* - another request may be submitted by the upper level driver
* of the stacking during the completion
* - the submission which requires queue lock may be done
- * against this queue
+ * against this clone's queue
*/
- dm_complete_request(clone, error);
+ dm_complete_request(tio->orig, error);
}
/*
@@ -1689,19 +1732,19 @@ static void dm_request(struct request_queue *q, struct bio *bio)
_dm_request(q, bio);
}
-void dm_dispatch_request(struct request *rq)
+static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
int r;
- if (blk_queue_io_stat(rq->q))
- rq->cmd_flags |= REQ_IO_STAT;
+ if (blk_queue_io_stat(clone->q))
+ clone->cmd_flags |= REQ_IO_STAT;
- rq->start_time = jiffies;
- r = blk_insert_cloned_request(rq->q, rq);
+ clone->start_time = jiffies;
+ r = blk_insert_cloned_request(clone->q, clone);
if (r)
+ /* must complete clone in terms of original request */
dm_complete_request(rq, r);
}
-EXPORT_SYMBOL_GPL(dm_dispatch_request);
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
@@ -1718,11 +1761,11 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
}
static int setup_clone(struct request *clone, struct request *rq,
- struct dm_rq_target_io *tio)
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
int r;
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
dm_rq_bio_constructor, tio);
if (r)
return r;
@@ -1733,14 +1776,37 @@ static int setup_clone(struct request *clone, struct request *rq,
clone->end_io = end_clone_request;
clone->end_io_data = tio;
+ tio->clone = clone;
+
return 0;
}
static struct request *clone_rq(struct request *rq, struct mapped_device *md,
- gfp_t gfp_mask)
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+ struct request *clone = alloc_clone_request(md, gfp_mask);
+
+ if (!clone)
+ return NULL;
+
+ blk_rq_init(NULL, clone);
+ if (setup_clone(clone, rq, tio, gfp_mask)) {
+ /* -ENOMEM */
+ free_clone_request(md, clone);
+ return NULL;
+ }
+
+ return clone;
+}
+
+static void map_tio_request(struct kthread_work *work);
+
+static struct dm_rq_target_io *prep_tio(struct request *rq,
+ struct mapped_device *md, gfp_t gfp_mask)
{
- struct request *clone;
struct dm_rq_target_io *tio;
+ int srcu_idx;
+ struct dm_table *table;
tio = alloc_rq_tio(md, gfp_mask);
if (!tio)
@@ -1748,18 +1814,23 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
tio->md = md;
tio->ti = NULL;
+ tio->clone = NULL;
tio->orig = rq;
tio->error = 0;
memset(&tio->info, 0, sizeof(tio->info));
-
- clone = &tio->clone;
- if (setup_clone(clone, rq, tio)) {
- /* -ENOMEM */
- free_rq_tio(tio);
- return NULL;
+ init_kthread_work(&tio->work, map_tio_request);
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (!dm_table_mq_request_based(table)) {
+ if (!clone_rq(rq, md, tio, gfp_mask)) {
+ dm_put_live_table(md, srcu_idx);
+ free_rq_tio(tio);
+ return NULL;
+ }
}
+ dm_put_live_table(md, srcu_idx);
- return clone;
+ return tio;
}
/*
@@ -1768,18 +1839,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
struct mapped_device *md = q->queuedata;
- struct request *clone;
+ struct dm_rq_target_io *tio;
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
}
- clone = clone_rq(rq, md, GFP_ATOMIC);
- if (!clone)
+ tio = prep_tio(rq, md, GFP_ATOMIC);
+ if (!tio)
return BLKPREP_DEFER;
- rq->special = clone;
+ rq->special = tio;
rq->cmd_flags |= REQ_DONTPREP;
return BLKPREP_OK;
@@ -1787,17 +1858,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
/*
* Returns:
- * 0 : the request has been processed (not requeued)
- * !0 : the request has been requeued
+ * 0 : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0 : the request was completed due to failure
*/
-static int map_request(struct dm_target *ti, struct request *clone,
+static int map_request(struct dm_target *ti, struct request *rq,
struct mapped_device *md)
{
- int r, requeued = 0;
- struct dm_rq_target_io *tio = clone->end_io_data;
+ int r;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = NULL;
+
+ if (tio->clone) {
+ clone = tio->clone;
+ r = ti->type->map_rq(ti, clone, &tio->info);
+ } else {
+ r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+ if (r < 0) {
+ /* The target wants to complete the I/O */
+ dm_kill_unmapped_request(rq, r);
+ return r;
+ }
+ if (IS_ERR(clone))
+ return DM_MAPIO_REQUEUE;
+ if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+ /* -ENOMEM */
+ ti->type->release_clone_rq(clone);
+ return DM_MAPIO_REQUEUE;
+ }
+ }
- tio->ti = ti;
- r = ti->type->map_rq(ti, clone, &tio->info);
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
@@ -1805,13 +1895,12 @@ static int map_request(struct dm_target *ti, struct request *clone,
case DM_MAPIO_REMAPPED:
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
- blk_rq_pos(tio->orig));
- dm_dispatch_request(clone);
+ blk_rq_pos(rq));
+ dm_dispatch_clone_request(clone, rq);
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
dm_requeue_unmapped_request(clone);
- requeued = 1;
break;
default:
if (r > 0) {
@@ -1820,20 +1909,27 @@ static int map_request(struct dm_target *ti, struct request *clone,
}
/* The target wants to complete the I/O */
- dm_kill_unmapped_request(clone, r);
- break;
+ dm_kill_unmapped_request(rq, r);
+ return r;
}
- return requeued;
+ return 0;
}
-static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
+static void map_tio_request(struct kthread_work *work)
{
- struct request *clone;
+ struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+ struct request *rq = tio->orig;
+ struct mapped_device *md = tio->md;
+ if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+ dm_requeue_unmapped_original_request(md, rq);
+}
+
+static void dm_start_request(struct mapped_device *md, struct request *orig)
+{
blk_start_request(orig);
- clone = orig->special;
- atomic_inc(&md->pending[rq_data_dir(clone)]);
+ atomic_inc(&md->pending[rq_data_dir(orig)]);
/*
* Hold the md reference here for the in-flight I/O.
@@ -1843,8 +1939,6 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
* See the comment in rq_completed() too.
*/
dm_get(md);
-
- return clone;
}
/*
@@ -1857,7 +1951,8 @@ static void dm_request_fn(struct request_queue *q)
int srcu_idx;
struct dm_table *map = dm_get_live_table(md, &srcu_idx);
struct dm_target *ti;
- struct request *rq, *clone;
+ struct request *rq;
+ struct dm_rq_target_io *tio;
sector_t pos;
/*
@@ -1879,34 +1974,29 @@ static void dm_request_fn(struct request_queue *q)
ti = dm_table_find_target(map, pos);
if (!dm_target_is_valid(ti)) {
/*
- * Must perform setup, that dm_done() requires,
+ * Must perform setup, that rq_completed() requires,
* before calling dm_kill_unmapped_request
*/
DMERR_LIMIT("request attempted access beyond the end of device");
- clone = dm_start_request(md, rq);
- dm_kill_unmapped_request(clone, -EIO);
+ dm_start_request(md, rq);
+ dm_kill_unmapped_request(rq, -EIO);
continue;
}
if (ti->type->busy && ti->type->busy(ti))
goto delay_and_out;
- clone = dm_start_request(md, rq);
-
- spin_unlock(q->queue_lock);
- if (map_request(ti, clone, md))
- goto requeued;
+ dm_start_request(md, rq);
+ tio = rq->special;
+ /* Establish tio->ti before queuing work (map_tio_request) */
+ tio->ti = ti;
+ queue_kthread_work(&md->kworker, &tio->work);
BUG_ON(!irqs_disabled());
- spin_lock(q->queue_lock);
}
goto out;
-requeued:
- BUG_ON(!irqs_disabled());
- spin_lock(q->queue_lock);
-
delay_and_out:
blk_delay_queue(q, HZ / 10);
out:
@@ -2092,6 +2182,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
+ md->kworker_task = NULL;
md->disk->major = _major;
md->disk->first_minor = minor;
@@ -2152,8 +2243,13 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md);
bdput(md->bdev);
destroy_workqueue(md->wq);
+
+ if (md->kworker_task)
+ kthread_stop(md->kworker_task);
if (md->io_pool)
mempool_destroy(md->io_pool);
+ if (md->rq_pool)
+ mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
blk_integrity_unregister(md->disk);
@@ -2187,23 +2283,24 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
bioset_free(md->bs);
md->bs = p->bs;
p->bs = NULL;
- } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
- /*
- * There's no need to reload with request-based dm
- * because the size of front_pad doesn't change.
- * Note for future: If you are to reload bioset,
- * prep-ed requests in the queue may refer
- * to bio from the old bioset, so you must walk
- * through the queue to unprep.
- */
}
+ /*
+ * There's no need to reload with request-based dm
+ * because the size of front_pad doesn't change.
+ * Note for future: If you are to reload bioset,
+ * prep-ed requests in the queue may refer
+ * to bio from the old bioset, so you must walk
+ * through the queue to unprep.
+ */
goto out;
}
- BUG_ON(!p || md->io_pool || md->bs);
+ BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
md->io_pool = p->io_pool;
p->io_pool = NULL;
+ md->rq_pool = p->rq_pool;
+ p->rq_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
@@ -2406,6 +2503,14 @@ unsigned dm_get_md_type(struct mapped_device *md)
return md->type;
}
+static bool dm_md_type_request_based(struct mapped_device *md)
+{
+ unsigned table_type = dm_get_md_type(md);
+
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
return md->immutable_target_type;
@@ -2443,6 +2548,11 @@ static int dm_init_request_based_queue(struct mapped_device *md)
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
+ /* Also initialize the request-based DM worker thread */
+ init_kthread_worker(&md->kworker);
+ md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+ "kdmwork-%s", dm_device_name(md));
+
elv_register_queue(md->queue);
return 1;
@@ -2453,8 +2563,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
*/
int dm_setup_md_queue(struct mapped_device *md)
{
- if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
- !dm_init_request_based_queue(md)) {
+ if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
DMWARN("Cannot initialize queue for request-based mapped device");
return -EINVAL;
}
@@ -2533,6 +2642,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
+ if (dm_request_based(md))
+ flush_kthread_worker(&md->kworker);
+
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
@@ -2776,8 +2888,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
*/
- if (dm_request_based(md))
+ if (dm_request_based(md)) {
stop_queue(md->queue);
+ flush_kthread_worker(&md->kworker);
+ }
flush_workqueue(md->wq);
@@ -3123,24 +3237,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
{
struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
struct kmem_cache *cachep;
- unsigned int pool_size;
+ unsigned int pool_size = 0;
unsigned int front_pad;
if (!pools)
return NULL;
- if (type == DM_TYPE_BIO_BASED) {
+ switch (type) {
+ case DM_TYPE_BIO_BASED:
cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
- } else if (type == DM_TYPE_REQUEST_BASED) {
- cachep = _rq_tio_cache;
+ break;
+ case DM_TYPE_REQUEST_BASED:
pool_size = dm_get_reserved_rq_based_ios();
+ pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+ if (!pools->rq_pool)
+ goto out;
+ /* fall through to setup remaining rq-based pools */
+ case DM_TYPE_MQ_REQUEST_BASED:
+ cachep = _rq_tio_cache;
+ if (!pool_size)
+ pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_bio_data_size is not used. See __bind_mempools(). */
WARN_ON(per_bio_data_size != 0);
- } else
+ break;
+ default:
goto out;
+ }
pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
if (!pools->io_pool)
@@ -3169,6 +3294,9 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (pools->io_pool)
mempool_destroy(pools->io_pool);
+ if (pools->rq_pool)
+ mempool_destroy(pools->rq_pool);
+
if (pools->bs)
bioset_free(pools->bs);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 84b0f9e4ba6c..59f53e79db82 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -34,9 +34,10 @@
/*
* Type of table and mapped_device's mempool
*/
-#define DM_TYPE_NONE 0
-#define DM_TYPE_BIO_BASED 1
-#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_NONE 0
+#define DM_TYPE_BIO_BASED 1
+#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_MQ_REQUEST_BASED 3
/*
* List of devices that a metadevice uses and should open/close.
@@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
+bool dm_table_mq_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
@@ -99,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md);
/*
* To check whether the target type is request-based or not (bio-based).
*/
-#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
+#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
+ ((t)->type->clone_and_map_rq != NULL))
/*
* To check whether the target type is a hybrid (capable of being
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index e8b4574956c7..1277eb26b58a 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -332,13 +332,11 @@ static int run(struct mddev *mddev)
return 0;
}
-static int stop(struct mddev *mddev)
+static void faulty_free(struct mddev *mddev, void *priv)
{
- struct faulty_conf *conf = mddev->private;
+ struct faulty_conf *conf = priv;
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static struct md_personality faulty_personality =
@@ -348,7 +346,7 @@ static struct md_personality faulty_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = faulty_free,
.status = status,
.check_reshape = reshape,
.size = faulty_size,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 64713b77df1c..fa7d577f3d12 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
lo = 0;
hi = mddev->raid_disks - 1;
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
/*
* Binary Search
@@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
*
* Return amount of bytes we can take at this offset
*/
-static int linear_mergeable_bvec(struct request_queue *q,
+static int linear_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int maxbytes = biovec->bv_len;
struct request_queue *subq;
- rcu_read_lock();
dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector;
subq = bdev_get_queue(dev0->rdev->bdev);
@@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
biovec));
}
- rcu_read_unlock();
if (maxsectors < bio_sectors)
maxsectors = 0;
@@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q,
return maxsectors << 9;
}
-static int linear_congested(void *data, int bits)
+static int linear_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct linear_conf *conf;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
ret |= bdi_congested(&q->backing_dev_info, bits);
}
- rcu_read_unlock();
return ret;
}
@@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk
struct linear_conf *conf;
sector_t array_sectors;
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
array_sectors = conf->array_sectors;
- rcu_read_unlock();
return array_sectors;
}
@@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev)
mddev->private = conf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
- blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
- mddev->queue->backing_dev_info.congested_fn = linear_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
-
ret = md_integrity_register(mddev);
if (ret) {
kfree(conf);
@@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
- oldconf = rcu_dereference_protected(mddev->private,
- lockdep_is_held(
- &mddev->reconfig_mutex));
+ mddev_suspend(mddev);
+ oldconf = mddev->private;
mddev->raid_disks++;
- rcu_assign_pointer(mddev->private, newconf);
+ mddev->private = newconf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
+ mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree_rcu(oldconf, rcu);
+ kfree(oldconf);
return 0;
}
-static int linear_stop (struct mddev *mddev)
+static void linear_free(struct mddev *mddev, void *priv)
{
- struct linear_conf *conf =
- rcu_dereference_protected(mddev->private,
- lockdep_is_held(
- &mddev->reconfig_mutex));
+ struct linear_conf *conf = priv;
- /*
- * We do not require rcu protection here since
- * we hold reconfig_mutex for both linear_add and
- * linear_stop, so they cannot race.
- * We should make sure any old 'conf's are properly
- * freed though.
- */
- rcu_barrier();
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf);
- mddev->private = NULL;
-
- return 0;
}
static void linear_make_request(struct mddev *mddev, struct bio *bio)
@@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
}
do {
- rcu_read_lock();
-
tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
end_sector = tmp_dev->end_sector;
data_offset = tmp_dev->rdev->data_offset;
bio->bi_bdev = tmp_dev->rdev->bdev;
- rcu_read_unlock();
-
if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
bio->bi_iter.bi_sector < start_sector))
goto out_of_bounds;
@@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
static struct md_personality linear_personality =
{
.name = "linear",
@@ -362,10 +332,13 @@ static struct md_personality linear_personality =
.owner = THIS_MODULE,
.make_request = linear_make_request,
.run = linear_run,
- .stop = linear_stop,
+ .free = linear_free,
.status = linear_status,
.hot_add_disk = linear_add,
.size = linear_size,
+ .quiesce = linear_quiesce,
+ .congested = linear_congested,
+ .mergeable_bvec = linear_mergeable_bvec,
};
static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 709755fb6d7b..c8d2bac4e28b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
+static void mddev_detach(struct mddev *mddev);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
/* mddev_suspend makes sure no new requests are submitted
* to the device, and that any requests that have been submitted
* are completely handled.
- * Once ->stop is called and completes, the module will be completely
- * unused.
+ * Once mddev_detach() is called and completes, the module will be
+ * completely unused.
*/
void mddev_suspend(struct mddev *mddev)
{
@@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume);
int mddev_congested(struct mddev *mddev, int bits)
{
- return mddev->suspended;
+ struct md_personality *pers = mddev->pers;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (mddev->suspended)
+ ret = 1;
+ else if (pers && pers->congested)
+ ret = pers->congested(mddev, bits);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mddev_congested);
+static int md_congested(void *data, int bits)
+{
+ struct mddev *mddev = data;
+ return mddev_congested(mddev, bits);
}
-EXPORT_SYMBOL(mddev_congested);
+static int md_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ int ret;
+ rcu_read_lock();
+ if (mddev->suspended) {
+ /* Must always allow one vec */
+ if (bvm->bi_size == 0)
+ ret = biovec->bv_len;
+ else
+ ret = 0;
+ } else {
+ struct md_personality *pers = mddev->pers;
+ if (pers && pers->mergeable_bvec)
+ ret = pers->mergeable_bvec(mddev, bvm, biovec);
+ else
+ ret = biovec->bv_len;
+ }
+ rcu_read_unlock();
+ return ret;
+}
/*
* Generic flush handling for md
*/
@@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws)
void md_flush_request(struct mddev *mddev, struct bio *bio)
{
- spin_lock_irq(&mddev->write_lock);
+ spin_lock_irq(&mddev->lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
- mddev->write_lock);
+ mddev->lock);
mddev->flush_bio = bio;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock_irq(&mddev->lock);
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
@@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev)
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
- spin_lock_init(&mddev->write_lock);
+ spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
@@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit)
goto retry;
}
-static inline int __must_check mddev_lock(struct mddev *mddev)
-{
- return mutex_lock_interruptible(&mddev->reconfig_mutex);
-}
-
-/* Sometimes we need to take the lock in a situation where
- * failure due to interrupts is not acceptable.
- */
-static inline void mddev_lock_nointr(struct mddev *mddev)
-{
- mutex_lock(&mddev->reconfig_mutex);
-}
-
-static inline int mddev_is_locked(struct mddev *mddev)
-{
- return mutex_is_locked(&mddev->reconfig_mutex);
-}
-
-static inline int mddev_trylock(struct mddev *mddev)
-{
- return mutex_trylock(&mddev->reconfig_mutex);
-}
-
static struct attribute_group md_redundancy_group;
-static void mddev_unlock(struct mddev *mddev)
+void mddev_unlock(struct mddev *mddev)
{
if (mddev->to_remove) {
/* These cannot be removed under reconfig_mutex as
@@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev)
md_wakeup_thread(mddev->thread);
spin_unlock(&pers_lock);
}
+EXPORT_SYMBOL_GPL(mddev_unlock);
static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
@@ -2230,7 +2246,7 @@ repeat:
return;
}
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
mddev->utime = get_seconds();
@@ -2287,7 +2303,7 @@ repeat:
}
sync_sbs(mddev, nospares);
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync);
@@ -2326,15 +2342,15 @@ repeat:
md_super_wait(mddev);
/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync != sync_req ||
test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
/* have to write it out again */
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
goto repeat;
}
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
wake_up(&mddev->sb_wait);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page)
{
char *sep = "";
size_t len = 0;
+ unsigned long flags = ACCESS_ONCE(rdev->flags);
- if (test_bit(Faulty, &rdev->flags) ||
+ if (test_bit(Faulty, &flags) ||
rdev->badblocks.unacked_exist) {
len+= sprintf(page+len, "%sfaulty",sep);
sep = ",";
}
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
- if (test_bit(WriteMostly, &rdev->flags)) {
+ if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
}
- if (test_bit(Blocked, &rdev->flags) ||
+ if (test_bit(Blocked, &flags) ||
(rdev->badblocks.unacked_exist
- && !test_bit(Faulty, &rdev->flags))) {
+ && !test_bit(Faulty, &flags))) {
len += sprintf(page+len, "%sblocked", sep);
sep = ",";
}
- if (!test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags)) {
+ if (!test_bit(Faulty, &flags) &&
+ !test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
}
- if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (test_bit(WriteErrorSeen, &flags)) {
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
- if (test_bit(WantReplacement, &rdev->flags)) {
+ if (test_bit(WantReplacement, &flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
- if (test_bit(Replacement, &rdev->flags)) {
+ if (test_bit(Replacement, &flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}
@@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
- struct mddev *mddev = rdev->mddev;
- ssize_t rv;
if (!entry->show)
return -EIO;
-
- rv = mddev ? mddev_lock(mddev) : -EBUSY;
- if (!rv) {
- if (rdev->mddev == NULL)
- rv = -EBUSY;
- else
- rv = entry->show(rdev, page);
- mddev_unlock(mddev);
- }
- return rv;
+ if (!rdev->mddev)
+ return -EBUSY;
+ return entry->show(rdev, page);
}
static ssize_t
@@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
mddev->safemode_delay = 0;
else {
unsigned long old_delay = mddev->safemode_delay;
- mddev->safemode_delay = (msec*HZ)/1000;
- if (mddev->safemode_delay == 0)
- mddev->safemode_delay = 1;
- if (mddev->safemode_delay < old_delay || old_delay == 0)
- md_safemode_timeout((unsigned long)mddev);
+ unsigned long new_delay = (msec*HZ)/1000;
+
+ if (new_delay == 0)
+ new_delay = 1;
+ mddev->safemode_delay = new_delay;
+ if (new_delay < old_delay || old_delay == 0)
+ mod_timer(&mddev->safemode_timer, jiffies+1);
}
return len;
}
@@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
static ssize_t
level_show(struct mddev *mddev, char *page)
{
- struct md_personality *p = mddev->pers;
+ struct md_personality *p;
+ int ret;
+ spin_lock(&mddev->lock);
+ p = mddev->pers;
if (p)
- return sprintf(page, "%s\n", p->name);
+ ret = sprintf(page, "%s\n", p->name);
else if (mddev->clevel[0])
- return sprintf(page, "%s\n", mddev->clevel);
+ ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
- return sprintf(page, "%d\n", mddev->level);
+ ret = sprintf(page, "%d\n", mddev->level);
else
- return 0;
+ ret = 0;
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
char clevel[16];
- ssize_t rv = len;
- struct md_personality *pers;
+ ssize_t rv;
+ size_t slen = len;
+ struct md_personality *pers, *oldpers;
long level;
- void *priv;
+ void *priv, *oldpriv;
struct md_rdev *rdev;
+ if (slen == 0 || slen >= sizeof(clevel))
+ return -EINVAL;
+
+ rv = mddev_lock(mddev);
+ if (rv)
+ return rv;
+
if (mddev->pers == NULL) {
- if (len == 0)
- return 0;
- if (len >= sizeof(mddev->clevel))
- return -ENOSPC;
- strncpy(mddev->clevel, buf, len);
- if (mddev->clevel[len-1] == '\n')
- len--;
- mddev->clevel[len] = 0;
+ strncpy(mddev->clevel, buf, slen);
+ if (mddev->clevel[slen-1] == '\n')
+ slen--;
+ mddev->clevel[slen] = 0;
mddev->level = LEVEL_NONE;
- return rv;
+ rv = len;
+ goto out_unlock;
}
+ rv = -EROFS;
if (mddev->ro)
- return -EROFS;
+ goto out_unlock;
/* request to change the personality. Need to ensure:
* - array is not engaged in resync/recovery/reshape
@@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
* - new personality will access other array.
*/
+ rv = -EBUSY;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active)
- return -EBUSY;
+ goto out_unlock;
+ rv = -EINVAL;
if (!mddev->pers->quiesce) {
printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
mdname(mddev), mddev->pers->name);
- return -EINVAL;
+ goto out_unlock;
}
/* Now find the new personality */
- if (len == 0 || len >= sizeof(clevel))
- return -EINVAL;
- strncpy(clevel, buf, len);
- if (clevel[len-1] == '\n')
- len--;
- clevel[len] = 0;
+ strncpy(clevel, buf, slen);
+ if (clevel[slen-1] == '\n')
+ slen--;
+ clevel[slen] = 0;
if (kstrtol(clevel, 10, &level))
level = LEVEL_NONE;
@@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (!pers || !try_module_get(pers->owner)) {
spin_unlock(&pers_lock);
printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
- return -EINVAL;
+ rv = -EINVAL;
+ goto out_unlock;
}
spin_unlock(&pers_lock);
if (pers == mddev->pers) {
/* Nothing to do! */
module_put(pers->owner);
- return rv;
+ rv = len;
+ goto out_unlock;
}
if (!pers->takeover) {
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
- return -EINVAL;
+ rv = -EINVAL;
+ goto out_unlock;
}
rdev_for_each(rdev, mddev)
@@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s would not accept array\n",
mdname(mddev), clevel);
- return PTR_ERR(priv);
+ rv = PTR_ERR(priv);
+ goto out_unlock;
}
/* Looks like we have a winner */
mddev_suspend(mddev);
- mddev->pers->stop(mddev);
+ mddev_detach(mddev);
- if (mddev->pers->sync_request == NULL &&
- pers->sync_request != NULL) {
- /* need to add the md_redundancy_group */
- if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
- printk(KERN_WARNING
- "md: cannot register extra attributes for %s\n",
- mdname(mddev));
- mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
- }
- if (mddev->pers->sync_request != NULL &&
- pers->sync_request == NULL) {
- /* need to remove the md_redundancy_group */
- if (mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- }
+ spin_lock(&mddev->lock);
+ oldpers = mddev->pers;
+ oldpriv = mddev->private;
+ mddev->pers = pers;
+ mddev->private = priv;
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ mddev->level = mddev->new_level;
+ mddev->layout = mddev->new_layout;
+ mddev->chunk_sectors = mddev->new_chunk_sectors;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ mddev->degraded = 0;
+ spin_unlock(&mddev->lock);
- if (mddev->pers->sync_request == NULL &&
+ if (oldpers->sync_request == NULL &&
mddev->external) {
/* We are converting from a no-redundancy array
* to a redundancy array and metadata is managed
@@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->safemode = 0;
}
+ oldpers->free(mddev, oldpriv);
+
+ if (oldpers->sync_request == NULL &&
+ pers->sync_request != NULL) {
+ /* need to add the md_redundancy_group */
+ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+ printk(KERN_WARNING
+ "md: cannot register extra attributes for %s\n",
+ mdname(mddev));
+ mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+ }
+ if (oldpers->sync_request != NULL &&
+ pers->sync_request == NULL) {
+ /* need to remove the md_redundancy_group */
+ if (mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ }
+
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
@@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
}
- module_put(mddev->pers->owner);
- mddev->pers = pers;
- mddev->private = priv;
- strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
- mddev->level = mddev->new_level;
- mddev->layout = mddev->new_layout;
- mddev->chunk_sectors = mddev->new_chunk_sectors;
- mddev->delta_disks = 0;
- mddev->reshape_backwards = 0;
- mddev->degraded = 0;
- if (mddev->pers->sync_request == NULL) {
+ if (pers->sync_request == NULL) {
/* this is now an array without redundancy, so
* it must always be in_sync
*/
@@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
md_update_sb(mddev, 1);
sysfs_notify(&mddev->kobj, NULL, "level");
md_new_event(mddev);
+ rv = len;
+out_unlock:
+ mddev_unlock(mddev);
return rv;
}
@@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
+ int err;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
- int err;
if (mddev->pers->check_reshape == NULL)
- return -EBUSY;
- if (mddev->ro)
- return -EROFS;
- mddev->new_layout = n;
- err = mddev->pers->check_reshape(mddev);
- if (err) {
- mddev->new_layout = mddev->layout;
- return err;
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_layout = n;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_layout = mddev->layout;
}
} else {
mddev->new_layout = n;
if (mddev->reshape_position == MaxSector)
mddev->layout = n;
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_layout =
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
@@ -3483,32 +3521,39 @@ static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
- int rv = 0;
+ int err;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers)
- rv = update_raid_disks(mddev, n);
+ err = update_raid_disks(mddev, n);
else if (mddev->reshape_position != MaxSector) {
struct md_rdev *rdev;
int olddisks = mddev->raid_disks - mddev->delta_disks;
+ err = -EINVAL;
rdev_for_each(rdev, mddev) {
if (olddisks < n &&
rdev->data_offset < rdev->new_data_offset)
- return -EINVAL;
+ goto out_unlock;
if (olddisks > n &&
rdev->data_offset > rdev->new_data_offset)
- return -EINVAL;
+ goto out_unlock;
}
+ err = 0;
mddev->delta_disks = n - olddisks;
mddev->raid_disks = n;
mddev->reshape_backwards = (mddev->delta_disks < 0);
} else
mddev->raid_disks = n;
- return rv ? rv : len;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ? err : len;
}
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
@@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
- int err;
if (mddev->pers->check_reshape == NULL)
- return -EBUSY;
- if (mddev->ro)
- return -EROFS;
- mddev->new_chunk_sectors = n >> 9;
- err = mddev->pers->check_reshape(mddev);
- if (err) {
- mddev->new_chunk_sectors = mddev->chunk_sectors;
- return err;
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_chunk_sectors = n >> 9;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
}
} else {
mddev->new_chunk_sectors = n >> 9;
if (mddev->reshape_position == MaxSector)
mddev->chunk_sectors = n >> 9;
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
@@ -3566,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
char *e;
unsigned long long n = simple_strtoull(buf, &e, 10);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
- return -EBUSY;
- if (cmd_match(buf, "none"))
+ err = -EBUSY;
+ else if (cmd_match(buf, "none"))
n = MaxSector;
else if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = -EINVAL;
- mddev->recovery_cp = n;
- if (mddev->pers)
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
- return len;
+ if (!err) {
+ mddev->recovery_cp = n;
+ if (mddev->pers)
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_resync_start =
__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
@@ -3677,8 +3733,39 @@ static int restart_array(struct mddev *mddev);
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
- int err = -EINVAL;
+ int err;
enum array_state st = match_word(buf, array_states);
+
+ if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
+ /* don't take reconfig_mutex when toggling between
+ * clean and active
+ */
+ spin_lock(&mddev->lock);
+ if (st == active) {
+ restart_array(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ err = 0;
+ } else /* st == clean */ {
+ restart_array(mddev);
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
+ }
+ spin_unlock(&mddev->lock);
+ return err;
+ }
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
switch(st) {
case bad_word:
break;
@@ -3722,7 +3809,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case clean:
if (mddev->pers) {
restart_array(mddev);
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
mddev->in_sync = 1;
@@ -3733,7 +3820,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = 0;
} else
err = -EBUSY;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
} else
err = -EINVAL;
break;
@@ -3754,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
/* these cannot be set */
break;
}
- if (err)
- return err;
- else {
+
+ if (!err) {
if (mddev->hold_active == UNTIL_IOCTL)
mddev->hold_active = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
- return len;
}
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -3822,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
+ flush_workqueue(md_misc_wq);
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->persistent) {
rdev = md_import_device(dev, mddev->major_version,
mddev->minor_version);
@@ -3845,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
out:
if (err)
export_rdev(rdev);
+ mddev_unlock(mddev);
return err ? err : len;
}
@@ -3856,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
char *end;
unsigned long chunk, end_chunk;
+ int err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (!mddev->bitmap)
goto out;
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
@@ -3874,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
}
bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
+ mddev_unlock(mddev);
return len;
}
@@ -3901,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err < 0)
return err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
@@ -3911,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
else
err = -ENOSPC;
}
+ mddev_unlock(mddev);
return err ? err : len;
}
@@ -3940,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
{
int major, minor;
char *e;
+ int err;
/* Changing the details of 'external' metadata is
* always permitted. Otherwise there must be
* no devices attached to the array.
*/
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
if (mddev->external && strncmp(buf, "external:", 9) == 0)
;
else if (!list_empty(&mddev->disks))
- return -EBUSY;
+ goto out_unlock;
+ err = 0;
if (cmd_match(buf, "none")) {
mddev->persistent = 0;
mddev->external = 0;
mddev->major_version = 0;
mddev->minor_version = 90;
- return len;
+ goto out_unlock;
}
if (strncmp(buf, "external:", 9) == 0) {
size_t namelen = len-9;
@@ -3968,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
mddev->external = 1;
mddev->major_version = 0;
mddev->minor_version = 90;
- return len;
+ goto out_unlock;
}
major = simple_strtoul(buf, &e, 10);
+ err = -EINVAL;
if (e==buf || *e != '.')
- return -EINVAL;
+ goto out_unlock;
buf = e+1;
minor = simple_strtoul(buf, &e, 10);
if (e==buf || (*e && *e != '\n') )
- return -EINVAL;
+ goto out_unlock;
+ err = -ENOENT;
if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
- return -ENOENT;
+ goto out_unlock;
mddev->major_version = major;
mddev->minor_version = minor;
mddev->persistent = 1;
mddev->external = 0;
- return len;
+ err = 0;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_metadata =
@@ -3993,20 +4107,21 @@ static ssize_t
action_show(struct mddev *mddev, char *page)
{
char *type = "idle";
- if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ unsigned long recovery = mddev->recovery;
+ if (test_bit(MD_RECOVERY_FROZEN, &recovery))
type = "frozen";
- else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
+ (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
type = "reshape";
- else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
type = "resync";
- else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_CHECK, &recovery))
type = "check";
else
type = "repair";
- } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
}
return sprintf(page, "%s\n", type);
@@ -4027,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
+ if (mddev_lock(mddev) == 0) {
+ md_reap_sync_thread(mddev);
+ mddev_unlock(mddev);
+ }
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -4041,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
- err = mddev->pers->start_reshape(mddev);
+ err = mddev_lock(mddev);
+ if (!err) {
+ err = mddev->pers->start_reshape(mddev);
+ mddev_unlock(mddev);
+ }
if (err)
return err;
sysfs_notify(&mddev->kobj, NULL, "degraded");
@@ -4225,22 +4347,36 @@ static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long long min;
+ int err;
+ int chunk;
+
if (kstrtoull(buf, 10, &min))
return -EINVAL;
+
+ spin_lock(&mddev->lock);
+ err = -EINVAL;
if (min > mddev->resync_max)
- return -EINVAL;
+ goto out_unlock;
+
+ err = -EBUSY;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
+ goto out_unlock;
/* Must be a multiple of chunk_size */
- if (mddev->chunk_sectors) {
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
sector_t temp = min;
- if (sector_div(temp, mddev->chunk_sectors))
- return -EINVAL;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
}
mddev->resync_min = min;
+ err = 0;
- return len;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
}
static struct md_sysfs_entry md_min_sync =
@@ -4258,29 +4394,42 @@ max_sync_show(struct mddev *mddev, char *page)
static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
+ spin_lock(&mddev->lock);
if (strncmp(buf, "max", 3) == 0)
mddev->resync_max = MaxSector;
else {
unsigned long long max;
+ int chunk;
+
+ err = -EINVAL;
if (kstrtoull(buf, 10, &max))
- return -EINVAL;
+ goto out_unlock;
if (max < mddev->resync_min)
- return -EINVAL;
+ goto out_unlock;
+
+ err = -EBUSY;
if (max < mddev->resync_max &&
mddev->ro == 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
+ goto out_unlock;
/* Must be a multiple of chunk_size */
- if (mddev->chunk_sectors) {
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
sector_t temp = max;
- if (sector_div(temp, mddev->chunk_sectors))
- return -EINVAL;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
}
mddev->resync_max = max;
}
wake_up(&mddev->recovery_wait);
- return len;
+ err = 0;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
}
static struct md_sysfs_entry md_max_sync =
@@ -4297,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old = mddev->suspend_lo;
+ unsigned long long old;
+ int err;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_lo;
mddev->suspend_lo = new;
if (new >= old)
/* Shrinking suspended region */
@@ -4314,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4330,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old = mddev->suspend_hi;
+ unsigned long long old;
+ int err;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_hi;
mddev->suspend_hi = new;
if (new <= old)
/* Shrinking suspended region */
@@ -4347,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4367,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
char *e;
+ int err;
unsigned long long new = simple_strtoull(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
+
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
+ if (mddev->pers)
+ goto unlock;
mddev->reshape_position = new;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
@@ -4380,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
mddev->new_chunk_sectors = mddev->chunk_sectors;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_reshape_position =
@@ -4398,6 +4574,8 @@ static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
int backwards = 0;
+ int err;
+
if (cmd_match(buf, "forwards"))
backwards = 0;
else if (cmd_match(buf, "backwards"))
@@ -4407,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->reshape_backwards == backwards)
return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
/* check if we are allowed to change */
if (mddev->delta_disks)
- return -EBUSY;
-
- if (mddev->persistent &&
+ err = -EBUSY;
+ else if (mddev->persistent &&
mddev->major_version == 0)
- return -EINVAL;
-
- mddev->reshape_backwards = backwards;
- return len;
+ err = -EINVAL;
+ else
+ mddev->reshape_backwards = backwards;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_reshape_direction =
@@ -4437,6 +4618,11 @@ static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
sector_t sectors;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (strncmp(buf, "default", 7) == 0) {
if (mddev->pers)
@@ -4447,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
mddev->external_size = 0;
} else {
if (strict_blocks_to_sectors(buf, &sectors) < 0)
- return -EINVAL;
- if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
- return -E2BIG;
-
- mddev->external_size = 1;
+ err = -EINVAL;
+ else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+ err = -E2BIG;
+ else
+ mddev->external_size = 1;
}
- mddev->array_sectors = sectors;
- if (mddev->pers) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ if (!err) {
+ mddev->array_sectors = sectors;
+ if (mddev->pers) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_array_size =
@@ -4523,11 +4712,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- rv = mddev_lock(mddev);
- if (!rv) {
- rv = entry->show(mddev, page);
- mddev_unlock(mddev);
- }
+ rv = entry->show(mddev, page);
mddev_put(mddev);
return rv;
}
@@ -4551,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- if (entry->store == new_dev_store)
- flush_workqueue(md_misc_wq);
- rv = mddev_lock(mddev);
- if (!rv) {
- rv = entry->store(mddev, page, length);
- mddev_unlock(mddev);
- }
+ rv = entry->store(mddev, page, length);
mddev_put(mddev);
return rv;
}
@@ -4825,7 +5004,6 @@ int md_run(struct mddev *mddev)
mddev->clevel);
return -EINVAL;
}
- mddev->pers = pers;
spin_unlock(&pers_lock);
if (mddev->level != pers->level) {
mddev->level = pers->level;
@@ -4836,7 +5014,6 @@ int md_run(struct mddev *mddev)
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
- mddev->pers = NULL;
module_put(pers->owner);
return -EINVAL;
}
@@ -4880,35 +5057,38 @@ int md_run(struct mddev *mddev)
if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */
- err = mddev->pers->run(mddev);
+ err = pers->run(mddev);
if (err)
printk(KERN_ERR "md: pers->run() failed ...\n");
- else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+ else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
WARN_ONCE(!mddev->external_size, "%s: default size too small,"
" but 'external_size' not in effect?\n", __func__);
printk(KERN_ERR
"md: invalid array_size %llu > default size %llu\n",
(unsigned long long)mddev->array_sectors / 2,
- (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+ (unsigned long long)pers->size(mddev, 0, 0) / 2);
err = -EINVAL;
- mddev->pers->stop(mddev);
}
- if (err == 0 && mddev->pers->sync_request &&
+ if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
err = bitmap_create(mddev);
- if (err) {
+ if (err)
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
- mddev->pers->stop(mddev);
- }
}
if (err) {
- module_put(mddev->pers->owner);
- mddev->pers = NULL;
+ mddev_detach(mddev);
+ pers->free(mddev, mddev->private);
+ module_put(pers->owner);
bitmap_destroy(mddev);
return err;
}
- if (mddev->pers->sync_request) {
+ if (mddev->queue) {
+ mddev->queue->backing_dev_info.congested_data = mddev;
+ mddev->queue->backing_dev_info.congested_fn = md_congested;
+ blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
+ }
+ if (pers->sync_request) {
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_redundancy_group))
printk(KERN_WARNING
@@ -4927,7 +5107,10 @@ int md_run(struct mddev *mddev)
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
+ spin_lock(&mddev->lock);
+ mddev->pers = pers;
mddev->ready = 1;
+ spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
if (sysfs_link_rdev(mddev, rdev))
@@ -5070,14 +5253,38 @@ void md_stop_writes(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_stop_writes);
+static void mddev_detach(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ if (mddev->pers && mddev->pers->quiesce) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ md_unregister_thread(&mddev->thread);
+ if (mddev->queue)
+ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+}
+
static void __md_stop(struct mddev *mddev)
{
+ struct md_personality *pers = mddev->pers;
+ mddev_detach(mddev);
+ spin_lock(&mddev->lock);
mddev->ready = 0;
- mddev->pers->stop(mddev);
- if (mddev->pers->sync_request && mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- module_put(mddev->pers->owner);
mddev->pers = NULL;
+ spin_unlock(&mddev->lock);
+ pers->free(mddev, mddev->private);
+ if (pers->sync_request && mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
@@ -5226,8 +5433,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
bitmap_destroy(mddev);
if (mddev->bitmap_info.file) {
- fput(mddev->bitmap_info.file);
+ struct file *f = mddev->bitmap_info.file;
+ spin_lock(&mddev->lock);
mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
}
mddev->bitmap_info.offset = 0;
@@ -5436,37 +5646,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
{
mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
- char *ptr, *buf = NULL;
- int err = -ENOMEM;
+ char *ptr;
+ int err;
file = kmalloc(sizeof(*file), GFP_NOIO);
-
if (!file)
- goto out;
+ return -ENOMEM;
+ err = 0;
+ spin_lock(&mddev->lock);
/* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap || !mddev->bitmap->storage.file) {
+ if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
- goto copy_out;
- }
-
- buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
- if (!buf)
- goto out;
-
- ptr = d_path(&mddev->bitmap->storage.file->f_path,
- buf, sizeof(file->pathname));
- if (IS_ERR(ptr))
- goto out;
-
- strcpy(file->pathname, ptr);
+ else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ file->pathname, sizeof(file->pathname))),
+ IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ spin_unlock(&mddev->lock);
-copy_out:
- err = 0;
- if (copy_to_user(arg, file, sizeof(*file)))
+ if (err == 0 &&
+ copy_to_user(arg, file, sizeof(*file)))
err = -EFAULT;
-out:
- kfree(buf);
+
kfree(file);
return err;
}
@@ -5789,22 +5993,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (fd >= 0) {
struct inode *inode;
- if (mddev->bitmap)
+ struct file *f;
+
+ if (mddev->bitmap || mddev->bitmap_info.file)
return -EEXIST; /* cannot add when bitmap is present */
- mddev->bitmap_info.file = fget(fd);
+ f = fget(fd);
- if (mddev->bitmap_info.file == NULL) {
+ if (f == NULL) {
printk(KERN_ERR "%s: error: failed to get bitmap file\n",
mdname(mddev));
return -EBADF;
}
- inode = mddev->bitmap_info.file->f_mapping->host;
+ inode = f->f_mapping->host;
if (!S_ISREG(inode->i_mode)) {
printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
mdname(mddev));
err = -EBADF;
- } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
+ } else if (!(f->f_mode & FMODE_WRITE)) {
printk(KERN_ERR "%s: error: bitmap file must open for write\n",
mdname(mddev));
err = -EBADF;
@@ -5814,10 +6020,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = -EBUSY;
}
if (err) {
- fput(mddev->bitmap_info.file);
- mddev->bitmap_info.file = NULL;
+ fput(f);
return err;
}
+ mddev->bitmap_info.file = f;
mddev->bitmap_info.offset = 0; /* file overrides offset */
} else if (mddev->bitmap == NULL)
return -ENOENT; /* cannot remove what isn't there */
@@ -5836,9 +6042,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
mddev->pers->quiesce(mddev, 0);
}
if (fd < 0) {
- if (mddev->bitmap_info.file)
- fput(mddev->bitmap_info.file);
- mddev->bitmap_info.file = NULL;
+ struct file *f = mddev->bitmap_info.file;
+ if (f) {
+ spin_lock(&mddev->lock);
+ mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
+ }
}
return err;
@@ -6251,6 +6461,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
case SET_DISK_FAULTY:
err = set_disk_faulty(mddev, new_decode_dev(arg));
goto out;
+
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, argp);
+ goto out;
+
}
if (cmd == ADD_NEW_DISK)
@@ -6342,10 +6557,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
* Commands even a read-only array can execute:
*/
switch (cmd) {
- case GET_BITMAP_FILE:
- err = get_bitmap_file(mddev, argp);
- goto unlock;
-
case RESTART_ARRAY_RW:
err = restart_array(mddev);
goto unlock;
@@ -6873,9 +7084,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
}
- if (mddev_lock(mddev) < 0)
- return -EINTR;
-
+ spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
mddev->pers ? "" : "in");
@@ -6888,7 +7097,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
sectors = 0;
- rdev_for_each(rdev, mddev) {
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6904,6 +7114,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "(R)");
sectors += rdev->sectors;
}
+ rcu_read_unlock();
if (!list_empty(&mddev->disks)) {
if (mddev->pers)
@@ -6946,7 +7157,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return 0;
}
@@ -7102,7 +7313,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
if (mddev->safemode == 1)
mddev->safemode = 0;
if (mddev->in_sync) {
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7110,7 +7321,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
md_wakeup_thread(mddev->thread);
did_change = 1;
}
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
}
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -7148,7 +7359,7 @@ int md_allow_write(struct mddev *mddev)
if (!mddev->pers->sync_request)
return 0;
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7156,11 +7367,11 @@ int md_allow_write(struct mddev *mddev)
if (mddev->safemode_delay &&
mddev->safemode == 0)
mddev->safemode = 1;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
return -EAGAIN;
@@ -7513,6 +7724,7 @@ void md_do_sync(struct md_thread *thread)
skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7521,6 +7733,8 @@ void md_do_sync(struct md_thread *thread)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
mddev->curr_resync = 0;
+ spin_unlock(&mddev->lock);
+
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -7688,7 +7902,7 @@ void md_check_recovery(struct mddev *mddev)
if (!mddev->external) {
int did_change = 0;
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->safemode &&
!atomic_read(&mddev->writes_pending) &&
!mddev->in_sync &&
@@ -7699,7 +7913,7 @@ void md_check_recovery(struct mddev *mddev)
}
if (mddev->safemode == 1)
mddev->safemode = 0;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
@@ -7721,7 +7935,9 @@ void md_check_recovery(struct mddev *mddev)
* any transients in the value of "sync_action".
*/
mddev->curr_resync_completed = 0;
+ spin_lock(&mddev->lock);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ spin_unlock(&mddev->lock);
/* Clear some bits that don't mean anything, but
* might be left set
*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 03cec5bdcaae..318ca8fd430f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -386,7 +386,18 @@ struct mddev {
struct work_struct del_work; /* used for delayed sysfs removal */
- spinlock_t write_lock;
+ /* "lock" protects:
+ * flush_bio transition from NULL to !NULL
+ * rdev superblocks, events
+ * clearing MD_CHANGE_*
+ * in_sync - and related safemode and MD_CHANGE changes
+ * pers (also protected by reconfig_mutex and pending IO).
+ * clearing ->bitmap
+ * clearing ->bitmap_info.file
+ * changing ->resync_{min,max}
+ * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
+ */
+ spinlock_t lock;
wait_queue_head_t sb_wait; /* for waiting on superblock updates */
atomic_t pending_writes; /* number of active superblock writes */
@@ -439,13 +450,30 @@ struct mddev {
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
};
-static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+static inline int __must_check mddev_lock(struct mddev *mddev)
{
- int faulty = test_bit(Faulty, &rdev->flags);
- if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev *mddev)
+{
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline int mddev_is_locked(struct mddev *mddev)
+{
+ return mutex_is_locked(&mddev->reconfig_mutex);
}
+static inline int mddev_trylock(struct mddev *mddev)
+{
+ return mutex_trylock(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
@@ -459,7 +487,7 @@ struct md_personality
struct module *owner;
void (*make_request)(struct mddev *mddev, struct bio *bio);
int (*run)(struct mddev *mddev);
- int (*stop)(struct mddev *mddev);
+ void (*free)(struct mddev *mddev, void *priv);
void (*status)(struct seq_file *seq, struct mddev *mddev);
/* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
@@ -490,6 +518,13 @@ struct md_personality
* array.
*/
void *(*takeover) (struct mddev *mddev);
+ /* congested implements bdi.congested_fn().
+ * Will not be called while array is 'suspended' */
+ int (*congested)(struct mddev *mddev, int bits);
+ /* mergeable_bvec is use to implement ->merge_bvec_fn */
+ int (*mergeable_bvec)(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec);
};
struct md_sysfs_entry {
@@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev)
return !!blk_check_plugged(md_unplug, mddev,
sizeof(struct blk_plug_cb));
}
+
+static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+{
+ int faulty = test_bit(Faulty, &rdev->flags);
+ if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+}
+
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 399272f9c042..ac3ede2bd00e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "]");
}
-static int multipath_congested(void *data, int bits)
+static int multipath_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct mpconf *conf = mddev->private;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
rcu_read_lock();
for (i = 0; i < mddev->raid_disks ; i++) {
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev)
/*
* copy the already verified devices into our private MULTIPATH
* bookkeeping area. [whatever we allocate in multipath_run(),
- * should be freed in multipath_stop()]
+ * should be freed in multipath_free()]
*/
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
@@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev)
*/
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
- mddev->queue->backing_dev_info.congested_fn = multipath_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
-
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -507,17 +500,13 @@ out:
return -EIO;
}
-static int multipath_stop (struct mddev *mddev)
+static void multipath_free(struct mddev *mddev, void *priv)
{
- struct mpconf *conf = mddev->private;
+ struct mpconf *conf = priv;
- md_unregister_thread(&mddev->thread);
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
mempool_destroy(conf->pool);
kfree(conf->multipaths);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static struct md_personality multipath_personality =
@@ -527,12 +516,13 @@ static struct md_personality multipath_personality =
.owner = THIS_MODULE,
.make_request = multipath_make_request,
.run = multipath_run,
- .stop = multipath_stop,
+ .free = multipath_free,
.status = multipath_status,
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
+ .congested = multipath_congested,
};
static int __init multipath_init (void)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ba6b85de96d2..a13f738a7b39 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,17 +25,13 @@
#include "raid0.h"
#include "raid5.h"
-static int raid0_congested(void *data, int bits)
+static int raid0_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct r0conf *conf = mddev->private;
struct md_rdev **devlist = conf->devlist;
int raid_disks = conf->strip_zone[0].nb_dev;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
for (i = 0; i < raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
@@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
mdname(mddev),
(unsigned long long)smallest->sectors);
}
- mddev->queue->backing_dev_info.congested_fn = raid0_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
/*
* now since we have the hard sector sizes, we can make sure
@@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
/**
* raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- * @q: request queue
+ * @mddev: the md device
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
*/
-static int raid0_mergeable_bvec(struct request_queue *q,
+static int raid0_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
sector_t sector_offset = sector;
@@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
return array_sectors;
}
-static int raid0_stop(struct mddev *mddev);
+static void raid0_free(struct mddev *mddev, void *priv);
static int raid0_run(struct mddev *mddev)
{
@@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
ret = md_integrity_register(mddev);
if (ret)
- raid0_stop(mddev);
+ raid0_free(mddev, conf);
return ret;
}
-static int raid0_stop(struct mddev *mddev)
+static void raid0_free(struct mddev *mddev, void *priv)
{
- struct r0conf *conf = mddev->private;
+ struct r0conf *conf = priv;
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
/*
@@ -724,11 +713,13 @@ static struct md_personality raid0_personality=
.owner = THIS_MODULE,
.make_request = raid0_make_request,
.run = raid0_run,
- .stop = raid0_stop,
+ .free = raid0_free,
.status = raid0_status,
.size = raid0_size,
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
+ .congested = raid0_congested,
+ .mergeable_bvec = raid0_mergeable_bvec,
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40b35be34f8d..5dd0c2e59ab9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_mergeable_bvec(struct request_queue *q,
+static int raid1_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r1conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max = biovec->bv_len;
@@ -734,7 +733,7 @@ static int raid1_mergeable_bvec(struct request_queue *q,
}
-int md_raid1_congested(struct mddev *mddev, int bits)
+static int raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
int i, ret = 0;
@@ -763,15 +762,6 @@ int md_raid1_congested(struct mddev *mddev, int bits)
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(md_raid1_congested);
-
-static int raid1_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid1_congested(mddev, bits);
-}
static void flush_pending_writes(struct r1conf *conf)
{
@@ -2882,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
-static int stop(struct mddev *mddev);
+static void raid1_free(struct mddev *mddev, void *priv);
static int run(struct mddev *mddev)
{
struct r1conf *conf;
@@ -2904,7 +2894,7 @@ static int run(struct mddev *mddev)
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
- * should be freed in stop()]
+ * should be freed in raid1_free()]
*/
if (mddev->private == NULL)
conf = setup_conf(mddev);
@@ -2955,10 +2945,6 @@ static int run(struct mddev *mddev)
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
if (mddev->queue) {
- mddev->queue->backing_dev_info.congested_fn = raid1_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
- blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
-
if (discard_supported)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
mddev->queue);
@@ -2968,37 +2954,23 @@ static int run(struct mddev *mddev)
}
ret = md_integrity_register(mddev);
- if (ret)
- stop(mddev);
+ if (ret) {
+ md_unregister_thread(&mddev->thread);
+ raid1_free(mddev, conf);
+ }
return ret;
}
-static int stop(struct mddev *mddev)
+static void raid1_free(struct mddev *mddev, void *priv)
{
- struct r1conf *conf = mddev->private;
- struct bitmap *bitmap = mddev->bitmap;
+ struct r1conf *conf = priv;
- /* wait for behind writes to complete */
- if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
- printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
- mdname(mddev));
- /* need to kick something here to make sure I/O goes? */
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
- }
-
- freeze_array(conf, 0);
- unfreeze_array(conf);
-
- md_unregister_thread(&mddev->thread);
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static int raid1_resize(struct mddev *mddev, sector_t sectors)
@@ -3181,7 +3153,7 @@ static struct md_personality raid1_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid1_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid1_add_disk,
@@ -3193,6 +3165,8 @@ static struct md_personality raid1_personality =
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
+ .congested = raid1_congested,
+ .mergeable_bvec = raid1_mergeable_bvec,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 33bda55ef9f7..14ebb288c1ef 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -170,7 +170,4 @@ struct r1bio {
*/
#define R1BIO_MadeGood 7
#define R1BIO_WriteError 8
-
-extern int md_raid1_congested(struct mddev *mddev, int bits);
-
#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 32e282f4c83c..b8d76b1fba64 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
/**
* raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @q: request queue
+ * @mddev: the md device
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
@@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* This requires checking for end-of-chunk if near_copies != raid_disks,
* and for subordinate merge_bvec_fns if merge_check_needed.
*/
-static int raid10_mergeable_bvec(struct request_queue *q,
+static int raid10_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
@@ -910,7 +909,7 @@ retry:
return rdev;
}
-int md_raid10_congested(struct mddev *mddev, int bits)
+static int raid10_congested(struct mddev *mddev, int bits)
{
struct r10conf *conf = mddev->private;
int i, ret = 0;
@@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits)
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(md_raid10_congested);
-
-static int raid10_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid10_congested(mddev, bits);
-}
static void flush_pending_writes(struct r10conf *conf)
{
@@ -3757,8 +3747,6 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int stripe = conf->geo.raid_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- mddev->queue->backing_dev_info.congested_fn = raid10_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
@@ -3767,7 +3755,6 @@ static int run(struct mddev *mddev)
stripe /= conf->geo.near_copies;
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
}
if (md_integrity_register(mddev))
@@ -3811,17 +3798,9 @@ out:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid10_free(struct mddev *mddev, void *priv)
{
- struct r10conf *conf = mddev->private;
-
- raise_barrier(conf, 0);
- lower_barrier(conf);
-
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- /* the unplug fn references 'conf'*/
- blk_sync_queue(mddev->queue);
+ struct r10conf *conf = priv;
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
@@ -3830,8 +3809,6 @@ static int stop(struct mddev *mddev)
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static void raid10_quiesce(struct mddev *mddev, int state)
@@ -3895,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
return 0;
}
-static void *raid10_takeover_raid0(struct mddev *mddev)
+static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
{
struct md_rdev *rdev;
struct r10conf *conf;
@@ -3905,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
+ sector_div(size, devs);
/* Set new parameters */
mddev->new_level = 10;
@@ -3915,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
mddev->raid_disks *= 2;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
+ mddev->dev_sectors = size;
conf = setup_conf(mddev);
if (!IS_ERR(conf)) {
rdev_for_each(rdev, mddev)
- if (rdev->raid_disk >= 0)
+ if (rdev->raid_disk >= 0) {
rdev->new_raid_disk = rdev->raid_disk * 2;
+ rdev->sectors = size;
+ }
conf->barrier = 1;
}
@@ -3943,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
- return raid10_takeover_raid0(mddev);
+ return raid10_takeover_raid0(mddev,
+ raid0_conf->strip_zone->zone_end,
+ raid0_conf->strip_zone->nb_dev);
}
return ERR_PTR(-EINVAL);
}
@@ -4713,7 +4696,7 @@ static struct md_personality raid10_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid10_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid10_add_disk,
@@ -4727,6 +4710,8 @@ static struct md_personality raid10_personality =
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
+ .congested = raid10_congested,
+ .mergeable_bvec = raid10_mergeable_bvec,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 157d69e83ff4..5ee6473ddc2c 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -150,7 +150,4 @@ enum r10bio_state {
*/
R10BIO_Previous,
};
-
-extern int md_raid10_congested(struct mddev *mddev, int bits);
-
#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98765f6f77f..aa76865b804b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- if (atomic_read(&conf->preread_active_stripes)
- < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
* Returns 1 when no more member devices need to be checked, otherwise returns
* 0 to tell the loop in handle_stripe_fill to continue
*/
-static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
- int disk_idx, int disks)
+
+static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
+ int i;
+
+
+ if (test_bit(R5_LOCKED, &dev->flags) ||
+ test_bit(R5_UPTODATE, &dev->flags))
+ /* No point reading this as we already have it or have
+ * decided to get it.
+ */
+ return 0;
+
+ if (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
+ /* We need this block to directly satisfy a request */
+ return 1;
+
+ if (s->syncing || s->expanding ||
+ (s->replacing && want_replace(sh, disk_idx)))
+ /* When syncing, or expanding we read everything.
+ * When replacing, we need the replaced block.
+ */
+ return 1;
+
+ if ((s->failed >= 1 && fdev[0]->toread) ||
+ (s->failed >= 2 && fdev[1]->toread))
+ /* If we want to read from a failed device, then
+ * we need to actually read every other device.
+ */
+ return 1;
+
+ /* Sometimes neither read-modify-write nor reconstruct-write
+ * cycles can work. In those cases we read every block we
+ * can. Then the parity-update is certain to have enough to
+ * work with.
+ * This can only be a problem when we need to write something,
+ * and some device has failed. If either of those tests
+ * fail we need look no further.
+ */
+ if (!s->failed || !s->to_write)
+ return 0;
+
+ if (test_bit(R5_Insync, &dev->flags) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ /* Pre-reads at not permitted until after short delay
+ * to gather multiple requests. However if this
+ * device is no Insync, the block could only be be computed
+ * and there is no need to delay that.
+ */
+ return 0;
+
+ for (i = 0; i < s->failed; i++) {
+ if (fdev[i]->towrite &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ /* If we have a partial write to a failed
+ * device, then we will need to reconstruct
+ * the content of that device, so all other
+ * devices must be read.
+ */
+ return 1;
+ }
+
+ /* If we are forced to do a reconstruct-write, either because
+ * the current RAID6 implementation only supports that, or
+ * or because parity cannot be trusted and we are currently
+ * recovering it, there is extra need to be careful.
+ * If one of the devices that we would need to read, because
+ * it is not being overwritten (and maybe not written at all)
+ * is missing/faulty, then we need to read everything we can.
+ */
+ if (sh->raid_conf->level != 6 &&
+ sh->sector < sh->raid_conf->mddev->recovery_cp)
+ /* reconstruct-write isn't being forced */
+ return 0;
+ for (i = 0; i < s->failed; i++) {
+ if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
+{
+ struct r5dev *dev = &sh->dev[disk_idx];
/* is the data in this block needed, and can we get it? */
- if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- (dev->toread ||
- (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding ||
- (s->replacing && want_replace(sh, disk_idx)) ||
- (s->failed >= 1 && fdev[0]->toread) ||
- (s->failed >= 2 && fdev[1]->toread) ||
- (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
- !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- ((sh->raid_conf->level == 6 ||
- sh->sector >= sh->raid_conf->mddev->recovery_cp)
- && s->failed && s->to_write &&
- (s->to_write - s->non_overwrite <
- sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
+ if (need_this_block(sh, s, disk_idx, disks)) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -4081,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf,
}
}
-int md_raid5_congested(struct mddev *mddev, int bits)
+static int raid5_congested(struct mddev *mddev, int bits)
{
struct r5conf *conf = mddev->private;
@@ -4098,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-EXPORT_SYMBOL_GPL(md_raid5_congested);
-
-static int raid5_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid5_congested(mddev, bits);
-}
/* We want read requests to align with chunks where possible,
* but write requests don't need to.
*/
-static int raid5_mergeable_bvec(struct request_queue *q,
+static int raid5_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -5296,11 +5354,14 @@ static void raid5d(struct md_thread *thread)
static ssize_t
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->max_nr_stripes);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+ spin_unlock(&mddev->lock);
+ return ret;
}
int
@@ -5339,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size);
static ssize_t
raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- err = raid5_set_cache_size(mddev, new);
+ err = mddev_lock(mddev);
if (err)
return err;
- return len;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else
+ err = raid5_set_cache_size(mddev, new);
+ mddev_unlock(mddev);
+
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5364,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->bypass_threshold);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->bypass_threshold);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new > conf->max_nr_stripes)
- return -EINVAL;
- conf->bypass_threshold = new;
- return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new > conf->max_nr_stripes)
+ err = -EINVAL;
+ else
+ conf->bypass_threshold = new;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5398,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
static ssize_t
raid5_show_skip_copy(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->skip_copy);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->skip_copy);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
new = !!new;
- if (new == conf->skip_copy)
- return len;
- mddev_suspend(mddev);
- conf->skip_copy = new;
- if (new)
- mddev->queue->backing_dev_info.capabilities |=
- BDI_CAP_STABLE_WRITES;
- else
- mddev->queue->backing_dev_info.capabilities &=
- ~BDI_CAP_STABLE_WRITES;
- mddev_resume(mddev);
- return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->skip_copy) {
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5454,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
static ssize_t
raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->worker_cnt_per_group);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static int alloc_thread_groups(struct r5conf *conf, int cnt,
@@ -5468,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
struct r5worker_group *new_groups, *old_groups;
@@ -5476,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new == conf->worker_cnt_per_group)
- return len;
-
- mddev_suspend(mddev);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->worker_cnt_per_group) {
+ mddev_suspend(mddev);
- old_groups = conf->worker_groups;
- if (old_groups)
- flush_workqueue(raid5_wq);
+ old_groups = conf->worker_groups;
+ if (old_groups)
+ flush_workqueue(raid5_wq);
- err = alloc_thread_groups(conf, new,
- &group_cnt, &worker_cnt_per_group,
- &new_groups);
- if (!err) {
- spin_lock_irq(&conf->device_lock);
- conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
- conf->worker_groups = new_groups;
- spin_unlock_irq(&conf->device_lock);
+ err = alloc_thread_groups(conf, new,
+ &group_cnt, &worker_cnt_per_group,
+ &new_groups);
+ if (!err) {
+ spin_lock_irq(&conf->device_lock);
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_groups;
+ spin_unlock_irq(&conf->device_lock);
- if (old_groups)
- kfree(old_groups[0].workers);
- kfree(old_groups);
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+ }
+ mddev_resume(mddev);
}
+ mddev_unlock(mddev);
- mddev_resume(mddev);
-
- if (err)
- return err;
- return len;
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -6178,11 +6266,6 @@ static int run(struct mddev *mddev)
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
-
- mddev->queue->backing_dev_info.congested_data = mddev;
- mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
@@ -6260,17 +6343,12 @@ abort:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid5_free(struct mddev *mddev, void *priv)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf = priv;
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- mddev->queue->backing_dev_info.congested_fn = NULL;
free_conf(conf);
- mddev->private = NULL;
mddev->to_remove = &raid5_attrs_group;
- return 0;
}
static void status(struct seq_file *seq, struct mddev *mddev)
@@ -7044,7 +7122,7 @@ static struct md_personality raid6_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7058,6 +7136,8 @@ static struct md_personality raid6_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
@@ -7066,7 +7146,7 @@ static struct md_personality raid5_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7080,6 +7160,8 @@ static struct md_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
@@ -7089,7 +7171,7 @@ static struct md_personality raid4_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7103,6 +7185,8 @@ static struct md_personality raid4_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d59f5ca743cd..983e18a83db1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout)
return layout >= 8 && layout <= 10;
}
-extern int md_raid5_congested(struct mddev *mddev, int bits);
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
#endif
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 53563955931b..55fa27ecf4e1 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex);
*/
struct mtd_file_info {
struct mtd_info *mtd;
- struct inode *ino;
enum mtd_file_modes mode;
};
@@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig)
return fixed_size_llseek(file, offset, orig, mfi->mtd->size);
}
-static int count;
-static struct vfsmount *mnt;
-static struct file_system_type mtd_inodefs_type;
-
static int mtdchar_open(struct inode *inode, struct file *file)
{
int minor = iminor(inode);
@@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file)
int ret = 0;
struct mtd_info *mtd;
struct mtd_file_info *mfi;
- struct inode *mtd_ino;
pr_debug("MTD_open\n");
@@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE) && (minor & 1))
return -EACCES;
- ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count);
- if (ret)
- return ret;
-
mutex_lock(&mtd_mutex);
mtd = get_mtd_device(NULL, devnum);
@@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file)
goto out1;
}
- mtd_ino = iget_locked(mnt->mnt_sb, devnum);
- if (!mtd_ino) {
- ret = -ENOMEM;
- goto out1;
- }
- if (mtd_ino->i_state & I_NEW) {
- mtd_ino->i_private = mtd;
- mtd_ino->i_mode = S_IFCHR;
- mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info;
- unlock_new_inode(mtd_ino);
- }
- file->f_mapping = mtd_ino->i_mapping;
-
/* You can't open it RW if it's not a writeable device */
if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) {
ret = -EACCES;
- goto out2;
+ goto out1;
}
mfi = kzalloc(sizeof(*mfi), GFP_KERNEL);
if (!mfi) {
ret = -ENOMEM;
- goto out2;
+ goto out1;
}
- mfi->ino = mtd_ino;
mfi->mtd = mtd;
file->private_data = mfi;
mutex_unlock(&mtd_mutex);
return 0;
-out2:
- iput(mtd_ino);
out1:
put_mtd_device(mtd);
out:
mutex_unlock(&mtd_mutex);
- simple_release_fs(&mnt, &count);
return ret;
} /* mtdchar_open */
@@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file)
if ((file->f_mode & FMODE_WRITE))
mtd_sync(mtd);
- iput(mfi->ino);
-
put_mtd_device(mtd);
file->private_data = NULL;
kfree(mfi);
- simple_release_fs(&mnt, &count);
return 0;
} /* mtdchar_close */
@@ -1117,6 +1087,13 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file,
ret = mtd_get_unmapped_area(mtd, len, offset, flags);
return ret == -EOPNOTSUPP ? -ENODEV : ret;
}
+
+static unsigned mtdchar_mmap_capabilities(struct file *file)
+{
+ struct mtd_file_info *mfi = file->private_data;
+
+ return mtd_mmap_capabilities(mfi->mtd);
+}
#endif
/*
@@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = {
.mmap = mtdchar_mmap,
#ifndef CONFIG_MMU
.get_unmapped_area = mtdchar_get_unmapped_area,
+ .mmap_capabilities = mtdchar_mmap_capabilities,
#endif
};
-static const struct super_operations mtd_ops = {
- .drop_inode = generic_delete_inode,
- .statfs = simple_statfs,
-};
-
-static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC);
-}
-
-static struct file_system_type mtd_inodefs_type = {
- .name = "mtd_inodefs",
- .mount = mtd_inodefs_mount,
- .kill_sb = kill_anon_super,
-};
-MODULE_ALIAS_FS("mtd_inodefs");
-
int __init init_mtdchar(void)
{
int ret;
@@ -1193,23 +1153,11 @@ int __init init_mtdchar(void)
return ret;
}
- ret = register_filesystem(&mtd_inodefs_type);
- if (ret) {
- pr_err("Can't register mtd_inodefs filesystem, error %d\n",
- ret);
- goto err_unregister_chdev;
- }
-
- return ret;
-
-err_unregister_chdev:
- __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
return ret;
}
void __exit cleanup_mtdchar(void)
{
- unregister_filesystem(&mtd_inodefs_type);
__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
}
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index b9000563b9f4..eacc3aac7327 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks;
- concat->mtd.backing_dev_info = subdev[0]->backing_dev_info;
-
concat->subdev[0] = subdev[0];
for (i = 1; i < num_devs; i++) {
@@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
subdev[i]->flags & MTD_WRITEABLE;
}
- /* only permit direct mapping if the BDIs are all the same
- * - copy-mapping is still permitted
- */
- if (concat->mtd.backing_dev_info !=
- subdev[i]->backing_dev_info)
- concat->mtd.backing_dev_info =
- &default_backing_dev_info;
-
concat->mtd.size += subdev[i]->size;
concat->mtd.ecc_stats.badblocks +=
subdev[i]->ecc_stats.badblocks;
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 4c611871d7e6..0ec4d6ea1e4b 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -43,33 +43,7 @@
#include "mtdcore.h"
-/*
- * backing device capabilities for non-mappable devices (such as NAND flash)
- * - permits private mappings, copies are taken of the data
- */
-static struct backing_dev_info mtd_bdi_unmappable = {
- .capabilities = BDI_CAP_MAP_COPY,
-};
-
-/*
- * backing device capabilities for R/O mappable devices (such as ROM)
- * - permits private mappings, copies are taken of the data
- * - permits non-writable shared mappings
- */
-static struct backing_dev_info mtd_bdi_ro_mappable = {
- .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
- BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP),
-};
-
-/*
- * backing device capabilities for writable mappable devices (such as RAM)
- * - permits private mappings, copies are taken of the data
- * - permits non-writable shared mappings
- */
-static struct backing_dev_info mtd_bdi_rw_mappable = {
- .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
- BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP |
- BDI_CAP_WRITE_MAP),
+static struct backing_dev_info mtd_bdi = {
};
static int mtd_cls_suspend(struct device *dev, pm_message_t state);
@@ -365,6 +339,23 @@ static struct device_type mtd_devtype = {
.release = mtd_release,
};
+#ifndef CONFIG_MMU
+unsigned mtd_mmap_capabilities(struct mtd_info *mtd)
+{
+ switch (mtd->type) {
+ case MTD_RAM:
+ return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC |
+ NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+ case MTD_ROM:
+ return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC |
+ NOMMU_MAP_READ;
+ default:
+ return NOMMU_MAP_COPY;
+ }
+}
+EXPORT_SYMBOL_GPL(mtd_mmap_capabilities);
+#endif
+
/**
* add_mtd_device - register an MTD device
* @mtd: pointer to new MTD device info structure
@@ -380,19 +371,7 @@ int add_mtd_device(struct mtd_info *mtd)
struct mtd_notifier *not;
int i, error;
- if (!mtd->backing_dev_info) {
- switch (mtd->type) {
- case MTD_RAM:
- mtd->backing_dev_info = &mtd_bdi_rw_mappable;
- break;
- case MTD_ROM:
- mtd->backing_dev_info = &mtd_bdi_ro_mappable;
- break;
- default:
- mtd->backing_dev_info = &mtd_bdi_unmappable;
- break;
- }
- }
+ mtd->backing_dev_info = &mtd_bdi;
BUG_ON(mtd->writesize == 0);
mutex_lock(&mtd_table_mutex);
@@ -1237,17 +1216,9 @@ static int __init init_mtd(void)
if (ret)
goto err_reg;
- ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap");
- if (ret)
- goto err_bdi1;
-
- ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap");
- if (ret)
- goto err_bdi2;
-
- ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap");
+ ret = mtd_bdi_init(&mtd_bdi, "mtd");
if (ret)
- goto err_bdi3;
+ goto err_bdi;
proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops);
@@ -1260,11 +1231,7 @@ static int __init init_mtd(void)
out_procfs:
if (proc_mtd)
remove_proc_entry("mtd", NULL);
-err_bdi3:
- bdi_destroy(&mtd_bdi_ro_mappable);
-err_bdi2:
- bdi_destroy(&mtd_bdi_unmappable);
-err_bdi1:
+err_bdi:
class_unregister(&mtd_class);
err_reg:
pr_err("Error registering mtd class or bdi: %d\n", ret);
@@ -1277,9 +1244,7 @@ static void __exit cleanup_mtd(void)
if (proc_mtd)
remove_proc_entry("mtd", NULL);
class_unregister(&mtd_class);
- bdi_destroy(&mtd_bdi_unmappable);
- bdi_destroy(&mtd_bdi_ro_mappable);
- bdi_destroy(&mtd_bdi_rw_mappable);
+ bdi_destroy(&mtd_bdi);
}
module_init(init_mtd);
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index a3e3a7d074d5..e779de315ade 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
slave->mtd.name = name;
slave->mtd.owner = master->owner;
- slave->mtd.backing_dev_info = master->backing_dev_info;
/* NOTE: we don't arrange MTDs as a tree; it'd be error-prone
* to have the same data be in two different partitions.
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 7f900229404d..96128cb009f3 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -28,8 +28,8 @@
static int dcssblk_open(struct block_device *bdev, fmode_t mode);
static void dcssblk_release(struct gendisk *disk, fmode_t mode);
static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
-static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
- void **kaddr, unsigned long *pfn);
+static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
+ void **kaddr, unsigned long *pfn, long size);
static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
@@ -877,25 +877,22 @@ fail:
bio_io_error(bio);
}
-static int
+static long
dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
- void **kaddr, unsigned long *pfn)
+ void **kaddr, unsigned long *pfn, long size)
{
struct dcssblk_dev_info *dev_info;
- unsigned long pgoff;
+ unsigned long offset, dev_sz;
dev_info = bdev->bd_disk->private_data;
if (!dev_info)
return -ENODEV;
- if (secnum % (PAGE_SIZE/512))
- return -EINVAL;
- pgoff = secnum / (PAGE_SIZE / 512);
- if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start)
- return -ERANGE;
- *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE);
+ dev_sz = dev_info->end - dev_info->start;
+ offset = secnum * 512;
+ *kaddr = (void *) (dev_info->start + offset);
*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
- return 0;
+ return dev_sz - offset;
}
static void
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 17bb541f7cc2..54d7a6cbb98a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2197,6 +2197,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
shost->tag_set.cmd_size = cmd_size;
shost->tag_set.numa_node = NUMA_NO_NODE;
shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ shost->tag_set.flags |=
+ BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
shost->tag_set.driver_data = shost;
return blk_mq_alloc_tag_set(&shost->tag_set);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 0deb385ad4d6..9c0a520d933c 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -277,7 +277,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
if (!shost_use_blk_mq(sdev->host) &&
(shost->bqt || shost->hostt->use_blk_tags)) {
blk_queue_init_tags(sdev->request_queue,
- sdev->host->cmd_per_lun, shost->bqt);
+ sdev->host->cmd_per_lun, shost->bqt,
+ shost->hostt->tag_alloc_policy);
}
scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index a668c88ea150..0cbc1fb45f10 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1719,22 +1719,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
}
if (iov_count) {
- int len, size = sizeof(struct sg_iovec) * iov_count;
+ int size = sizeof(struct iovec) * iov_count;
struct iovec *iov;
+ struct iov_iter i;
iov = memdup_user(hp->dxferp, size);
if (IS_ERR(iov))
return PTR_ERR(iov);
- len = iov_length(iov, iov_count);
- if (hp->dxfer_len < len) {
- iov_count = iov_shorten(iov, iov_count, hp->dxfer_len);
- len = hp->dxfer_len;
- }
+ iov_iter_init(&i, rw, iov, iov_count,
+ min_t(size_t, hp->dxfer_len,
+ iov_length(iov, iov_count)));
- res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov,
- iov_count,
- len, GFP_ATOMIC);
+ res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC);
kfree(iov);
} else
res = blk_rq_map_user(q, rq, md, hp->dxferp,
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index a3367bfb1456..45aaa1cc56bc 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
if (err)
goto out_free;
lsi->lsi_flags |= LSI_BDI_INITIALIZED;
- lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+ lsi->lsi_bdi.capabilities = 0;
err = ll_bdi_register(&lsi->lsi_bdi);
if (err)
goto out_free;
@@ -1812,10 +1812,6 @@ void ll_read_inode2(struct inode *inode, void *opaque)
/* OIDEBUG(inode); */
- /* initializing backing dev info. */
- inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
-
-
if (S_ISREG(inode->i_mode)) {
struct ll_sb_info *sbi = ll_i2sbi(inode);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
}
init_rwsem(&v9ses->rename_sem);
- rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+ rc = bdi_setup_and_register(&v9ses->bdi, "9p");
if (rc) {
kfree(v9ses->aname);
kfree(v9ses->uname);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
volume->cell = params->cell;
volume->vid = vlocation->vldb.vid[params->type];
- ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+ ret = bdi_setup_and_register(&volume->bdi, "afs");
if (ret)
goto error_bdi;
diff --git a/fs/aio.c b/fs/aio.c
index c428871f1093..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
- .name = "aiofs",
- .state = 0,
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
inode->i_mapping->a_ops = &aio_ctx_aops;
inode->i_mapping->private_data = ctx;
- inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
inode->i_size = PAGE_SIZE * nr_pages;
path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
- if (bdi_init(&aio_fs_backing_dev_info))
- panic("Failed to init aio fs backing dev info.");
-
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..975266be67d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-/*
- * Move the inode from its current bdi to a new bdi. Make sure the inode
- * is clean before moving so that it doesn't linger on the old bdi.
- */
-static void bdev_inode_switch_bdi(struct inode *inode,
- struct backing_dev_info *dst)
+static void bdev_write_inode(struct inode *inode)
{
- while (true) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY)) {
- inode->i_data.backing_dev_info = dst;
- spin_unlock(&inode->i_lock);
- return;
- }
+ spin_lock(&inode->i_lock);
+ while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
WARN_ON_ONCE(write_inode_now(inode, true));
+ spin_lock(&inode->i_lock);
}
+ spin_unlock(&inode->i_lock);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
+/**
+ * bdev_direct_access() - Get the address for directly-accessibly memory
+ * @bdev: The device containing the memory
+ * @sector: The offset within the device
+ * @addr: Where to put the address of the memory
+ * @pfn: The Page Frame Number for the memory
+ * @size: The number of bytes requested
+ *
+ * If a block device is made up of directly addressable memory, this function
+ * will tell the caller the PFN and the address of the memory. The address
+ * may be directly dereferenced within the kernel without the need to call
+ * ioremap(), kmap() or similar. The PFN is suitable for inserting into
+ * page tables.
+ *
+ * Return: negative errno if an error occurs, otherwise the number of bytes
+ * accessible at this address.
+ */
+long bdev_direct_access(struct block_device *bdev, sector_t sector,
+ void **addr, unsigned long *pfn, long size)
+{
+ long avail;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+ if (size < 0)
+ return size;
+ if (!ops->direct_access)
+ return -EOPNOTSUPP;
+ if ((sector + DIV_ROUND_UP(size, 512)) >
+ part_nr_sects_read(bdev->bd_part))
+ return -ERANGE;
+ sector += get_start_sect(bdev);
+ if (sector % (PAGE_SIZE / 512))
+ return -EINVAL;
+ avail = ops->direct_access(bdev, sector, addr, pfn, size);
+ if (!avail)
+ return -ERANGE;
+ return min(avail, size);
+}
+EXPORT_SYMBOL_GPL(bdev_direct_access);
+
/*
* pseudo-fs
*/
@@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- inode->i_data.backing_dev_info = &default_backing_dev_info;
spin_lock(&bdev_lock);
list_add(&bdev->bd_list, &all_bdevs);
spin_unlock(&bdev_lock);
@@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
- struct backing_dev_info *bdi;
-
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
if (!bdev->bd_part)
@@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret) {
+ if (!ret)
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- bdi = blk_get_backing_dev_info(bdev);
- bdev_inode_switch_bdi(bdev->bd_inode, bdi);
- }
/*
* If the device is invalidated, rescan partition
@@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_clear;
bdev->bd_contains = whole;
- bdev_inode_switch_bdi(bdev->bd_inode,
- whole->bd_inode->i_data.backing_dev_info);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
- /* ->release can cause the old bdi to disappear,
- * so must switch it out first
+ /*
+ * ->release can cause the queue to disappear, so flush all
+ * dirty data before.
*/
- bdev_inode_switch_bdi(bdev->bd_inode,
- &default_backing_dev_info);
+ bdev_write_inode(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..1afb18226da8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1715,12 +1715,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
- bdi->capabilities = BDI_CAP_MAP_COPY;
- err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
+ err = bdi_setup_and_register(bdi, "btrfs");
if (err)
return err;
- bdi->ra_pages = default_backing_dev_info.ra_pages;
+ bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
return 0;
@@ -2319,7 +2318,6 @@ int open_ctree(struct super_block *sb,
*/
fs_info->btree_inode->i_size = OFFSET_MAX;
fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
- fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a606ab551296..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
mutex_lock(&inode->i_mutex);
- current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bf326affb94..54bcf639d1cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3608,7 +3608,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3622,6 @@ cache_acl:
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode->i_mapping->a_ops = &btrfs_symlink_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
@@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode->i_mapping->a_ops = &btrfs_symlink_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b394b49d..905986dd4c3c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = file->f_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f61a74115beb..6b5173605154 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
}
inode->i_mapping->a_ops = &ceph_aops;
- inode->i_mapping->backing_dev_info =
- &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 50f06cddc94b..5ae62587a71d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
dout("put_super\n");
ceph_mdsc_close_sessions(fsc->mdsc);
-
- /*
- * ensure we release the bdi before put_anon_super releases
- * the device name.
- */
- if (s->s_bdi == &fsc->backing_dev_info) {
- bdi_unregister(&fsc->backing_dev_info);
- s->s_bdi = NULL;
- }
-
- return;
}
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -910,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb,
>> PAGE_SHIFT;
else
fsc->backing_dev_info.ra_pages =
- default_backing_dev_info.ra_pages;
+ VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
@@ -1002,11 +991,16 @@ out_final:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ dev_t dev = s->s_dev;
+
dout("kill_sb %p\n", s);
+
ceph_mdsc_pre_umount(fsc->mdsc);
- kill_anon_super(s); /* will call put_super after sb is r/o */
+ generic_shutdown_super(s);
ceph_mdsc_destroy(fsc);
+
destroy_fs_client(fsc);
+ free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
#include "internal.h"
-/*
- * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
- * devices
- * - permits shared-mmap for read, write and/or exec
- * - does not permit private mmap in NOMMU mode (can't do COW)
- * - no readahead or I/O queue unplugging required
- */
-struct backing_dev_info directly_mappable_cdev_bdi = {
- .name = "char",
- .capabilities = (
-#ifdef CONFIG_MMU
- /* permit private copies of the data to be taken */
- BDI_CAP_MAP_COPY |
-#endif
- /* permit direct mmap, for read, write or exec */
- BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
- /* no writeback happens */
- BDI_CAP_NO_ACCT_AND_WRITEBACK),
-};
-
static struct kobj_map *cdev_map;
static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
void __init chrdev_init(void)
{
cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
- if (bdi_init(&directly_mappable_cdev_bdi))
- panic("Failed to init directly mappable cdev bdi");
}
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);
-EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
int referral_walks_count = 0;
#endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+ rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
if (rc)
return rc;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
inode->i_ino = hash;
- if (S_ISREG(inode->i_mode))
- inode->i_data.backing_dev_info = sb->s_bdi;
#ifdef CONFIG_CIFS_FSCACHE
/* initialize per-inode cache cookie pointer */
CIFS_I(inode)->fscache = NULL;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
goto unlock_out;
}
- error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+ error = bdi_setup_and_register(&vc->bdi, "coda");
if (error)
goto unlock_out;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
-extern int configfs_inode_init(void);
-extern void configfs_inode_exit(void);
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
.write_end = simple_write_end,
};
-static struct backing_dev_info configfs_backing_dev_info = {
- .name = "configfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static const struct inode_operations configfs_inode_operations ={
.setattr = configfs_setattr,
};
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mapping->a_ops = &configfs_aops;
- inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
inode->i_op = &configfs_inode_operations;
if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
}
mutex_unlock(&dir->d_inode->i_mutex);
}
-
-int __init configfs_inode_init(void)
-{
- return bdi_init(&configfs_backing_dev_info);
-}
-
-void configfs_inode_exit(void)
-{
- bdi_destroy(&configfs_backing_dev_info);
-}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
if (!config_kobj)
goto out2;
- err = configfs_inode_init();
- if (err)
- goto out3;
-
err = register_filesystem(&configfs_fs_type);
if (err)
- goto out4;
+ goto out3;
return 0;
-out4:
- pr_err("Unable to register filesystem!\n");
- configfs_inode_exit();
out3:
+ pr_err("Unable to register filesystem!\n");
kobject_put(config_kobj);
out2:
kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
kobject_put(config_kobj);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
- configfs_inode_exit();
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
inode->i_ino = lower_inode->i_ino;
inode->i_version++;
inode->i_mapping->a_ops = &ecryptfs_aops;
- inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
if (S_ISLNK(inode->i_mode))
inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
- rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
+ rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
if (rc)
goto out1;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..6fc91df99ff8 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
}
- inode->i_mapping->backing_dev_info = sb->s_bdi;
if (S_ISREG(inode->i_mode)) {
inode->i_op = &exofs_file_inode_operations;
inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
set_obj_2bcreated(oi);
- inode->i_mapping->backing_dev_info = sb->s_bdi;
inode_init_owner(inode, dir, mode);
inode->i_ino = sbi->s_nextid++;
inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+ ret = bdi_setup_and_register(&sbi->bdi, "exofs");
if (ret) {
EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
dput(sb->s_root);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
struct ext2_group_desc * gdp;
struct backing_dev_info *bdi;
- bdi = inode->i_mapping->backing_dev_info;
+ bdi = inode_to_bdi(inode);
if (bdi_read_congested(bdi))
return;
if (bdi_write_congested(bdi))
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index e98171a11cfe..bbc5fec6ff7f 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -13,18 +13,12 @@
#include "ext2.h"
#include "xip.h"
-static inline int
-__inode_direct_access(struct inode *inode, sector_t block,
- void **kaddr, unsigned long *pfn)
+static inline long __inode_direct_access(struct inode *inode, sector_t block,
+ void **kaddr, unsigned long *pfn, long size)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- const struct block_device_operations *ops = bdev->bd_disk->fops;
- sector_t sector;
-
- sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
-
- BUG_ON(!ops->direct_access);
- return ops->direct_access(bdev, sector, kaddr, pfn);
+ sector_t sector = block * (PAGE_SIZE / 512);
+ return bdev_direct_access(bdev, sector, kaddr, pfn, size);
}
static inline int
@@ -53,12 +47,13 @@ ext2_clear_xip_target(struct inode *inode, sector_t block)
{
void *kaddr;
unsigned long pfn;
- int rc;
+ long size;
- rc = __inode_direct_access(inode, block, &kaddr, &pfn);
- if (!rc)
- clear_page(kaddr);
- return rc;
+ size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE);
+ if (size < 0)
+ return size;
+ clear_page(kaddr);
+ return 0;
}
void ext2_xip_verify_sb(struct super_block *sb)
@@ -77,7 +72,7 @@ void ext2_xip_verify_sb(struct super_block *sb)
int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
void **kmem, unsigned long *pfn)
{
- int rc;
+ long rc;
sector_t block;
/* first, retrieve the sector number */
@@ -86,6 +81,6 @@ int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
return rc;
/* retrieve address of the target data */
- rc = __inode_direct_access(mapping->host, block, kmem, pfn);
- return rc;
+ rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE);
+ return (rc < 0) ? rc : 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ac64edbe501d..64c39c7c594f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
static int block_device_ejected(struct super_block *sb)
{
struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
return bdi->dev == NULL;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..c399152de397 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(writeback_in_progress);
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb;
- if (sb_is_blkdev_sb(sb))
- return inode->i_mapping->backing_dev_info;
+ if (!inode)
+ return &noop_backing_dev_info;
+ sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+ if (sb_is_blkdev_sb(sb))
+ return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
return sb->s_bdi;
}
+EXPORT_SYMBOL_GPL(inode_to_bdi);
static inline struct inode *wb_inode(struct list_head *head)
{
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d769e594855b..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
{
struct inode *inode = req->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
old_req->state == FUSE_REQ_PENDING)) {
- struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f38256e4476e..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
inode->i_generation = generation;
- inode->i_data.backing_dev_info = &fc->bdi;
fuse_init_inode(inode, attr);
unlock_new_inode(inode);
} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ trace_wbc_writepage(wbc, inode_to_bdi(inode));
ret = __gfs2_jdata_writepage(page, wbc);
if (unlikely(ret)) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index aeb7bc958a18..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -768,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = s->s_bdi;
mapping->writeback_index = 0;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = sb->s_bdi;
mapping->writeback_index = 0;
spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
- struct backing_dev_info *bdi = metamapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
int ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}
-static struct backing_dev_info hugetlbfs_backing_dev_info = {
- .name = "hugetlbfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
int sysctl_hugetlb_shm_group;
enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->private_data = resv_map;
info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
return -ENOTSUPP;
}
- error = bdi_init(&hugetlbfs_backing_dev_info);
- if (error)
- return error;
-
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
out2:
- bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
}
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
for_each_hstate(h)
kern_unmount(hugetlbfs_vfsmount[i++]);
unregister_filesystem(&hugetlbfs_fs_type);
- bdi_destroy(&hugetlbfs_backing_dev_info);
}
module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index 86c612b92c6f..86bfaca724db 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&mapping->i_mmap_writable, 0);
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
- mapping->backing_dev_info = &default_backing_dev_info;
mapping->writeback_index = 0;
-
- /*
- * If the block_device provides a backing_dev_info for client
- * inodes then use that. Otherwise the inode share the bdev's
- * backing_dev_info.
- */
- if (sb->s_bdev) {
- struct backing_dev_info *bdi;
-
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
- mapping->backing_dev_info = bdi;
- }
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
.write_end = simple_write_end,
};
-static struct backing_dev_info kernfs_bdi = {
- .name = "kernfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static const struct inode_operations kernfs_iops = {
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
.listxattr = kernfs_iop_listxattr,
};
-void __init kernfs_inode_init(void)
-{
- if (bdi_init(&kernfs_bdi))
- panic("failed to init kernfs_bdi");
-}
-
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
kernfs_get(kn);
inode->i_private = kn;
inode->i_mapping->a_ops = &kernfs_aops;
- inode->i_mapping->backing_dev_info = &kernfs_bdi;
inode->i_op = &kernfs_iops;
set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
size_t size);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
-void kernfs_inode_init(void);
/*
* dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
sizeof(struct kernfs_node),
0, SLAB_PANIC, NULL);
- kernfs_inode_init();
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
if (inode) {
atomic_set(&NCP_FINFO(inode)->opened, info->opened);
- inode->i_mapping->backing_dev_info = sb->s_bdi;
inode->i_ino = info->ino;
ncp_set_attr(inode, info);
if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
server = NCP_SBP(sb);
memset(server, 0, sizeof(*server));
- error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+ error = bdi_setup_and_register(&server->bdi, "ncpfs");
if (error)
goto out_fput;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3c9769441f36..7ae1c263c5cf 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1002,7 +1002,7 @@ mds_commit:
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
__mark_inode_dirty(req->wb_context->dentry->d_inode,
I_DIRTY_DATASYNC);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index f29fb7d7e8f8..c22ecaa86c1c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1366,7 +1366,7 @@ ff_layout_mark_request_commit(struct nfs_page *req,
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
__mark_inode_dirty(req->wb_context->dentry->d_inode,
I_DIRTY_DATASYNC);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d2398c193bda..e4f0dcef8f54 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -388,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
if (S_ISREG(inode->i_mode)) {
inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
- inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 21469e6e3834..212b8c883d22 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -430,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *);
int nfs_show_devname(struct seq_file *, struct dentry *);
int nfs_show_path(struct seq_file *, struct dentry *);
int nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
/* write.c */
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 48cea3c30e5d..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs4_write_inode,
.drop_inode = nfs_drop_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.evict_inode = nfs4_evict_inode,
.umount_begin = nfs_umount_begin,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 368d9395d2e7..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
.drop_inode = nfs_drop_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.evict_inode = nfs_evict_inode,
.umount_begin = nfs_umount_begin,
@@ -2572,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
error = nfs_bdi_register(server);
if (error) {
mntroot = ERR_PTR(error);
- goto error_splat_bdi;
+ goto error_splat_super;
}
server->super = s;
}
@@ -2604,9 +2603,6 @@ error_splat_root:
dput(mntroot);
mntroot = ERR_PTR(error);
error_splat_super:
- if (server && !s->s_root)
- bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
deactivate_locked_super(s);
goto out;
}
@@ -2654,27 +2650,19 @@ out:
EXPORT_SYMBOL_GPL(nfs_fs_mount);
/*
- * Ensure that we unregister the bdi before kill_anon_super
- * releases the device name
- */
-void nfs_put_super(struct super_block *s)
-{
- struct nfs_server *server = NFS_SB(s);
-
- bdi_unregister(&server->backing_dev_info);
-}
-EXPORT_SYMBOL_GPL(nfs_put_super);
-
-/*
* Destroy an NFS2/3 superblock
*/
void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
+ dev_t dev = s->s_dev;
+
+ generic_shutdown_super(s);
- kill_anon_super(s);
nfs_fscache_release_super_cookie(s);
+
nfs_free_server(server);
+ free_anon_bdev(dev);
}
EXPORT_SYMBOL_GPL(nfs_kill_super);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bcf83e535f29..88a6d2196ece 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -791,7 +791,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
__mark_inode_dirty(req->wb_context->dentry->d_inode,
I_DIRTY_DATASYNC);
@@ -858,7 +858,7 @@ static void
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
}
/* Called holding inode (/cinfo) lock */
@@ -1607,7 +1607,7 @@ void nfs_retry_commit(struct list_head *page_list,
nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
if (!cinfo->dreq) {
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
}
nfs_unlock_and_release_request(req);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
inode->i_mode = S_IFREG;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
inode->i_mapping->a_ops = &empty_aops;
- inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
ii->i_flags = 0;
nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
inode->i_mode = S_IFREG;
mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
- inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
inode->i_op = &def_mdt_iops;
inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
- struct backing_dev_info *bdi = inode->i_sb->s_bdi;
INIT_LIST_HEAD(&shadow->frozen_buffers);
address_space_init_once(&shadow->frozen_data);
- nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
+ nilfs_mapping_init(&shadow->frozen_data, inode);
address_space_init_once(&shadow->frozen_btnodes);
- nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
+ nilfs_mapping_init(&shadow->frozen_btnodes, inode);
mi->mi_shadow = shadow;
return 0;
}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
return nc;
}
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
- struct backing_dev_info *bdi)
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
{
mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = bdi;
mapping->a_ops = &empty_aops;
}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
- struct backing_dev_info *bdi);
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
ii->i_state = 0;
ii->i_cno = 0;
ii->vfs_inode.i_version = 1;
- nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
+ nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
return &ii->vfs_inode;
}
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct the_nilfs *nilfs;
struct nilfs_root *fsroot;
- struct backing_dev_info *bdi;
__u64 cno;
int err;
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
- sb->s_bdi = bdi ? : &default_backing_dev_info;
+ sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
count = iov_length(iov, nr_segs);
pos = *ppos;
/* We can write back this queue in page reclaim. */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
written = 0;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
ip->ip_conn = NULL;
}
-static struct backing_dev_info dlmfs_backing_dev_info = {
- .name = "ocfs2-dlmfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, NULL, mode);
- inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(inode, parent, mode);
- inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- status = bdi_init(&dlmfs_backing_dev_info);
- if (status)
- return status;
-
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
- bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
- bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 245db4f504da..e0f04d55fd05 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2363,7 +2363,7 @@ relock:
goto out_dio;
}
} else {
- current->backing_dev_info = file->f_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
written = generic_perform_write(file, from, *ppos);
if (likely(written >= 0))
iocb->ki_pos = *ppos + written;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long flags);
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
const struct file_operations ramfs_file_operations = {
+ .mmap_capabilities = ramfs_mmap_capabilities,
.mmap = ramfs_nommu_mmap,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
.read = new_sync_read,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
.set_page_dirty = __set_page_dirty_no_writeback,
};
-static struct backing_dev_info ramfs_backing_dev_info = {
- .name = "ramfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
- BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
- BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
-
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
- inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
int __init init_ramfs_fs(void)
{
static unsigned long once;
- int err;
if (test_and_set_bit(0, &once))
return 0;
-
- err = bdi_init(&ramfs_backing_dev_info);
- if (err)
- return err;
-
- err = register_filesystem(&ramfs_fs_type);
- if (err)
- bdi_destroy(&ramfs_backing_dev_info);
-
- return err;
+ return register_filesystem(&ramfs_fs_type);
}
fs_initcall(init_ramfs_fs);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
}
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+ struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+
+ if (!mtd)
+ return NOMMU_MAP_COPY;
+ return mtd_mmap_capabilities(mtd);
+}
+
const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
.read = new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
.splice_read = generic_file_splice_read,
.mmap = romfs_mmap,
.get_unmapped_area = romfs_get_unmapped_area,
+ .mmap_capabilities = romfs_mmap_capabilities,
};
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
case ROMFH_REG:
i->i_fop = &romfs_ro_fops;
i->i_data.a_ops = &romfs_aops;
- if (i->i_sb->s_mtd)
- i->i_data.backing_dev_info =
- i->i_sb->s_mtd->backing_dev_info;
if (nextfh & ROMFH_EXEC)
mode |= S_IXUGO;
break;
diff --git a/fs/super.c b/fs/super.c
index a89d6250bab8..1facd2c282e5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
#include "internal.h"
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
"sb_writers",
@@ -186,8 +186,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
}
init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen);
+ s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
- s->s_bdi = &default_backing_dev_info;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -872,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev);
int set_anon_super(struct super_block *s, void *data)
{
- int error = get_anon_bdev(&s->s_dev);
- if (!error)
- s->s_bdi = &noop_backing_dev_info;
- return error;
+ return get_anon_bdev(&s->s_dev);
}
EXPORT_SYMBOL(set_anon_super);
@@ -1120,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
sb = root->d_sb;
BUG_ON(!sb);
WARN_ON(!sb->s_bdi);
- WARN_ON(sb->s_bdi == &default_backing_dev_info);
sb->s_flags |= MS_BORN;
error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..c49b1981ac95 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
inode->i_mtime = inode->i_atime = inode->i_ctime =
ubifs_current_time(inode);
inode->i_mapping->nrpages = 0;
- /* Disable readahead */
- inode->i_mapping->backing_dev_info = &c->bdi;
switch (mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..6197154f36ca 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
if (err)
goto out_invalid;
- /* Disable read-ahead */
- inode->i_mapping->backing_dev_info = &c->bdi;
-
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
* Read-ahead will be disabled because @c->bdi.ra_pages is 0.
*/
c->bdi.name = "ubifs",
- c->bdi.capabilities = BDI_CAP_MAP_COPY;
+ c->bdi.capabilities = 0;
err = bdi_init(&c->bdi);
if (err)
goto out_close;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f2d05a19d68c..1cdba95c78cb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -735,7 +735,7 @@ xfs_file_buffered_aio_write(
iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012b7a14..d94077fea1f8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -106,6 +106,8 @@ struct backing_dev_info {
#endif
};
+struct backing_dev_info *inode_to_bdi(struct inode *inode);
+
int __must_check bdi_init(struct backing_dev_info *bdi);
void bdi_destroy(struct backing_dev_info *bdi);
@@ -114,7 +116,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
void bdi_unregister(struct backing_dev_info *bdi);
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
+int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
enum wb_reason reason);
void bdi_start_background_writeback(struct backing_dev_info *bdi);
@@ -228,46 +230,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
* BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting
* BDI_CAP_NO_WRITEBACK: Don't write pages back
* BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages
- *
- * These flags let !MMU mmap() govern direct device mapping vs immediate
- * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
- *
- * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE)
- * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED)
- * BDI_CAP_READ_MAP: Can be mapped for reading
- * BDI_CAP_WRITE_MAP: Can be mapped for writing
- * BDI_CAP_EXEC_MAP: Can be mapped for execution
- *
- * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed.
- *
* BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold.
*/
#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
#define BDI_CAP_NO_WRITEBACK 0x00000002
-#define BDI_CAP_MAP_COPY 0x00000004
-#define BDI_CAP_MAP_DIRECT 0x00000008
-#define BDI_CAP_READ_MAP 0x00000010
-#define BDI_CAP_WRITE_MAP 0x00000020
-#define BDI_CAP_EXEC_MAP 0x00000040
-#define BDI_CAP_NO_ACCT_WB 0x00000080
-#define BDI_CAP_SWAP_BACKED 0x00000100
-#define BDI_CAP_STABLE_WRITES 0x00000200
-#define BDI_CAP_STRICTLIMIT 0x00000400
-
-#define BDI_CAP_VMFLAGS \
- (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+#define BDI_CAP_NO_ACCT_WB 0x00000004
+#define BDI_CAP_STABLE_WRITES 0x00000008
+#define BDI_CAP_STRICTLIMIT 0x00000010
#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
-#if defined(VM_MAYREAD) && \
- (BDI_CAP_READ_MAP != VM_MAYREAD || \
- BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
- BDI_CAP_EXEC_MAP != VM_MAYEXEC)
-#error please change backing_dev_info::capabilities flags
-#endif
-
-extern struct backing_dev_info default_backing_dev_info;
extern struct backing_dev_info noop_backing_dev_info;
int writeback_in_progress(struct backing_dev_info *bdi);
@@ -329,24 +302,14 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
BDI_CAP_NO_WRITEBACK));
}
-static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
-{
- return bdi->capabilities & BDI_CAP_SWAP_BACKED;
-}
-
static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
{
- return bdi_cap_writeback_dirty(mapping->backing_dev_info);
+ return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host));
}
static inline bool mapping_cap_account_dirty(struct address_space *mapping)
{
- return bdi_cap_account_dirty(mapping->backing_dev_info);
-}
-
-static inline bool mapping_cap_swap_backed(struct address_space *mapping)
-{
- return bdi_cap_swap_backed(mapping->backing_dev_info);
+ return bdi_cap_account_dirty(inode_to_bdi(mapping->host));
}
static inline int bdi_sched_wait(void *word)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index efead0b532c4..da3a127c9958 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -428,13 +428,9 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
unsigned int, unsigned int);
extern int bio_get_nr_vecs(struct block_device *);
-extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
- unsigned long, unsigned int, int, gfp_t);
-struct sg_iovec;
struct rq_map_data;
extern struct bio *bio_map_user_iov(struct request_queue *,
- struct block_device *,
- const struct sg_iovec *, int, int, gfp_t);
+ const struct iov_iter *, gfp_t);
extern void bio_unmap_user(struct bio *);
extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
gfp_t);
@@ -462,12 +458,10 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
-extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
- unsigned long, unsigned int, int, gfp_t);
extern struct bio *bio_copy_user_iov(struct request_queue *,
struct rq_map_data *,
- const struct sg_iovec *,
- int, int, gfp_t);
+ const struct iov_iter *,
+ gfp_t);
extern int bio_uncopy_user(struct bio *);
void zero_fill_bio(struct bio *bio);
extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5735e7130d63..7aec86127335 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -146,6 +146,8 @@ enum {
BLK_MQ_F_SG_MERGE = 1 << 2,
BLK_MQ_F_SYSFS_UP = 1 << 3,
BLK_MQ_F_DEFER_ISSUE = 1 << 4,
+ BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
+ BLK_MQ_F_ALLOC_POLICY_BITS = 1,
BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
@@ -154,6 +156,12 @@ enum {
BLK_MQ_CPU_WORK_BATCH = 8,
};
+#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
+ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
+ ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
+#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
+ ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
+ << BLK_MQ_F_ALLOC_POLICY_START_BIT)
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
void blk_mq_finish_init(struct request_queue *q);
@@ -166,7 +174,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_insert_request(struct request *, bool, bool, bool);
-void blk_mq_run_queues(struct request_queue *q, bool async);
void blk_mq_free_request(struct request *rq);
void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
@@ -214,6 +221,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
void *priv);
+void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_mq_freeze_queue_start(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 92f4b4b288dd..7f9a516f24de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -272,7 +272,11 @@ struct blk_queue_tag {
int max_depth; /* what we will send to device */
int real_max_depth; /* what the array can hold */
atomic_t refcnt; /* map can be shared */
+ int alloc_policy; /* tag allocation policy */
+ int next_tag; /* next tag */
};
+#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
+#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
@@ -516,6 +520,7 @@ struct request_queue {
(1 << QUEUE_FLAG_ADD_RANDOM))
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
+ (1 << QUEUE_FLAG_STACKABLE) | \
(1 << QUEUE_FLAG_SAME_COMP))
static inline void queue_lockdep_assert_held(struct request_queue *q)
@@ -850,8 +855,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *,
extern int blk_rq_unmap_user(struct bio *);
extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
- struct rq_map_data *, const struct sg_iovec *,
- int, unsigned int, gfp_t);
+ struct rq_map_data *, const struct iov_iter *,
+ gfp_t);
extern int blk_execute_rq(struct request_queue *, struct gendisk *,
struct request *, int);
extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
@@ -1044,8 +1049,6 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
-extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
- struct scatterlist *sglist);
extern void blk_dump_rq_flags(struct request *, char *);
extern long nr_blockdev_pages(void);
@@ -1139,11 +1142,11 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
extern int blk_queue_start_tag(struct request_queue *, struct request *);
extern struct request *blk_queue_find_tag(struct request_queue *, int);
extern void blk_queue_end_tag(struct request_queue *, struct request *);
-extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
+extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int);
extern void blk_queue_free_tags(struct request_queue *);
extern int blk_queue_resize_tags(struct request_queue *, int);
extern void blk_queue_invalidate_tags(struct request_queue *);
-extern struct blk_queue_tag *blk_init_tags(int);
+extern struct blk_queue_tag *blk_init_tags(int, int);
extern void blk_free_tags(struct blk_queue_tag *);
static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
@@ -1162,7 +1165,7 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct page *page);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask);
+ sector_t nr_sects, gfp_t gfp_mask, bool discard);
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
@@ -1176,7 +1179,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
return blkdev_issue_zeroout(sb->s_bdev,
block << (sb->s_blocksize_bits - 9),
nr_blocks << (sb->s_blocksize_bits - 9),
- gfp_mask);
+ gfp_mask, true);
}
extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1601,8 +1604,8 @@ struct block_device_operations {
int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
- int (*direct_access) (struct block_device *, sector_t,
- void **, unsigned long *);
+ long (*direct_access)(struct block_device *, sector_t,
+ void **, unsigned long *pfn, long size);
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1620,6 +1623,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
+extern long bdev_direct_access(struct block_device *, sector_t, void **addr,
+ unsigned long *pfn, long size);
#else /* CONFIG_BLOCK */
struct block_device;
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index fb4591977b03..f8763615a5f2 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -30,6 +30,4 @@ void cdev_del(struct cdev *);
void cd_forget(struct inode *);
-extern struct backing_dev_info directly_mappable_cdev_bdi;
-
#endif
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ca6d2acc5eb7..2646aed1d3fe 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -48,6 +48,11 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio);
typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
union map_info *map_context);
+typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
+ struct request *rq,
+ union map_info *map_context,
+ struct request **clone);
+typedef void (*dm_release_clone_request_fn) (struct request *clone);
/*
* Returns:
@@ -143,6 +148,8 @@ struct target_type {
dm_dtr_fn dtr;
dm_map_fn map;
dm_map_request_fn map_rq;
+ dm_clone_and_map_request_fn clone_and_map_rq;
+ dm_release_clone_request_fn release_clone_rq;
dm_endio_fn end_io;
dm_request_endio_fn rq_end_io;
dm_presuspend_fn presuspend;
@@ -600,9 +607,6 @@ static inline unsigned long to_bytes(sector_t n)
/*-----------------------------------------------------------------
* Helper for block layer and dm core operations
*---------------------------------------------------------------*/
-void dm_dispatch_request(struct request *rq);
-void dm_requeue_unmapped_request(struct request *rq);
-void dm_kill_unmapped_request(struct request *rq, int error);
int dm_underlying_device_busy(struct request_queue *q);
#endif /* _LINUX_DEVICE_MAPPER_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a20d6586e517..e49f10cc8a73 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -34,6 +34,7 @@
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
+struct backing_dev_info;
struct export_operations;
struct hd_geometry;
struct iovec;
@@ -394,7 +395,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-struct backing_dev_info;
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
@@ -408,7 +408,6 @@ struct address_space {
pgoff_t writeback_index;/* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
- struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
void *private_data; /* ditto */
@@ -1201,8 +1200,6 @@ struct mm_struct;
#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
-extern struct list_head super_blocks;
-extern spinlock_t sb_lock;
/* Possible states of 'frozen' field */
enum {
@@ -1519,6 +1516,26 @@ struct block_device_operations;
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1
+/*
+ * These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
+ *
+ * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE)
+ * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED)
+ * NOMMU_MAP_READ: Can be mapped for reading
+ * NOMMU_MAP_WRITE: Can be mapped for writing
+ * NOMMU_MAP_EXEC: Can be mapped for execution
+ */
+#define NOMMU_MAP_COPY 0x00000001
+#define NOMMU_MAP_DIRECT 0x00000008
+#define NOMMU_MAP_READ VM_MAYREAD
+#define NOMMU_MAP_WRITE VM_MAYWRITE
+#define NOMMU_MAP_EXEC VM_MAYEXEC
+
+#define NOMMU_VMFLAGS \
+ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
+
+
struct iov_iter;
struct file_operations {
@@ -1553,6 +1570,9 @@ struct file_operations {
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
+#ifndef CONFIG_MMU
+ unsigned (*mmap_capabilities)(struct file *);
+#endif
};
struct inode_operations {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 61df823ac86a..fc03efa64ffe 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -823,10 +823,10 @@ struct ata_port {
unsigned int cbl; /* cable type; ATA_CBL_xxx */
struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
- unsigned long qc_allocated;
+ unsigned long sas_tag_allocated; /* for sas tag allocation only */
unsigned int qc_active;
int nr_active_links; /* #links with active qcs */
- unsigned int last_tag; /* track next tag hw expects */
+ unsigned int sas_last_tag; /* track next tag hw expects */
struct ata_link link; /* host default link */
struct ata_link *slave_link; /* see ata_slave_link_init() */
@@ -1352,6 +1352,7 @@ extern struct device_attribute *ata_common_sdev_attrs[];
.ioctl = ata_scsi_ioctl, \
.queuecommand = ata_scsi_queuecmd, \
.can_queue = ATA_DEF_QUEUE, \
+ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \
.this_id = ATA_SHT_THIS_ID, \
.cmd_per_lun = ATA_SHT_CMD_PER_LUN, \
.emulated = ATA_SHT_EMULATED, \
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 031ff3a9a0bd..3301c4c289d6 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -408,4 +408,6 @@ static inline int mtd_is_bitflip_or_eccerr(int err) {
return mtd_is_bitflip(err) || mtd_is_eccerr(err);
}
+unsigned mtd_mmap_capabilities(struct mtd_info *mtd);
+
#endif /* __MTD_MTD_H__ */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 258945fcabf1..19a5d4b23209 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -132,13 +132,12 @@ struct nvme_ns {
* allocated to store the PRP list.
*/
struct nvme_iod {
- void *private; /* For the use of the submitter of the I/O */
+ unsigned long private; /* For the use of the submitter of the I/O */
int npages; /* In the PRP list. 0 means small pool in use */
int offset; /* Of PRP list */
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
dma_addr_t first_dma;
- struct list_head node;
struct scatterlist sg[0];
};
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 537d58eea8a0..2db83349865b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -267,6 +267,21 @@ do { \
__wait_event(wq, condition); \
} while (0)
+#define __io_wait_event(wq, condition) \
+ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+ io_schedule())
+
+/*
+ * io_wait_event() -- like wait_event() but with io_schedule()
+ */
+#define io_wait_event(wq, condition) \
+do { \
+ might_sleep(); \
+ if (condition) \
+ break; \
+ __io_wait_event(wq, condition); \
+} while (0)
+
#define __wait_event_freezable(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule(); try_to_freeze())
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 019e66858ce6..e113c757d555 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -402,6 +402,9 @@ struct scsi_host_template {
*/
unsigned char present;
+ /* If use block layer to manage tags, this is tag allocation policy */
+ int tag_alloc_policy;
+
/*
* Let the block layer assigns tags to all commands.
*/
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index 9708b28bd2aa..b27977e8aaed 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -66,7 +66,8 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth)
* devices on the shared host (for libata)
*/
if (!shost->bqt) {
- shost->bqt = blk_init_tags(depth);
+ shost->bqt = blk_init_tags(depth,
+ shost->hostt->tag_alloc_policy);
if (!shost->bqt)
return -ENOMEM;
}
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d65ab3f..0e9310905413 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -47,7 +47,7 @@ TRACE_EVENT(writeback_dirty_page,
TP_fast_assign(
strncpy(__entry->name,
- mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32);
+ mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32);
__entry->ino = mapping ? mapping->host->i_ino : 0;
__entry->index = page->index;
),
@@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
),
TP_fast_assign(
- struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
/* may be called for files on pseudo FSes w/ unregistered bdi */
strncpy(__entry->name,
@@ -116,7 +116,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
TP_fast_assign(
strncpy(__entry->name,
- dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+ dev_name(inode_to_bdi(inode)->dev), 32);
__entry->ino = inode->i_ino;
__entry->sync_mode = wbc->sync_mode;
),
@@ -156,10 +156,8 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__field(int, reason)
),
TP_fast_assign(
- struct device *dev = bdi->dev;
- if (!dev)
- dev = default_backing_dev_info.dev;
- strncpy(__entry->name, dev_name(dev), 32);
+ strncpy(__entry->name,
+ bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
__entry->nr_pages = work->nr_pages;
__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
__entry->sync_mode = work->sync_mode;
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index a570d7b5796c..889f3a5b7b18 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
-#define DM_VERSION_MINOR 29
+#define DM_VERSION_MINOR 30
#define DM_VERSION_PATCHLEVEL 0
-#define DM_VERSION_EXTRA "-ioctl (2014-10-28)"
+#define DM_VERSION_EXTRA "-ioctl (2014-12-22)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 7d0e5cd7b570..dbef2314901e 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -89,10 +89,10 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
EXPORT_SYMBOL_GPL(raid6_datap_recov);
const struct raid6_recov_calls *const raid6_recov_algos[] = {
-#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
#ifdef CONFIG_AS_AVX2
&raid6_recov_avx2,
#endif
+#ifdef CONFIG_AS_SSSE3
&raid6_recov_ssse3,
#endif
&raid6_recov_intx1,
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
index e1eea433a493..53fe3d7bdfb3 100644
--- a/lib/raid6/recov_avx2.c
+++ b/lib/raid6/recov_avx2.c
@@ -8,7 +8,7 @@
* of the License.
*/
-#if CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX2
#include <linux/raid/pq.h>
#include "x86.h"
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
index a9168328f03b..cda33e56a5e3 100644
--- a/lib/raid6/recov_ssse3.c
+++ b/lib/raid6/recov_ssse3.c
@@ -7,6 +7,8 @@
* of the License.
*/
+#ifdef CONFIG_AS_SSSE3
+
#include <linux/raid/pq.h>
#include "x86.h"
@@ -330,3 +332,7 @@ const struct raid6_recov_calls raid6_recov_ssse3 = {
#endif
.priority = 1,
};
+
+#else
+#warning "your version of binutils lacks SSSE3 support"
+#endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..7690ec77c722 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,19 +14,10 @@
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-struct backing_dev_info default_backing_dev_info = {
- .name = "default",
- .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
- .state = 0,
- .capabilities = BDI_CAP_MAP_COPY,
-};
-EXPORT_SYMBOL_GPL(default_backing_dev_info);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -40,17 +31,6 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
-static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
-{
- if (wb1 < wb2) {
- spin_lock(&wb1->list_lock);
- spin_lock_nested(&wb2->list_lock, 1);
- } else {
- spin_lock(&wb2->list_lock);
- spin_lock_nested(&wb1->list_lock, 1);
- }
-}
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -264,9 +244,6 @@ static int __init default_bdi_init(void)
if (!bdi_wq)
return -ENOMEM;
- err = bdi_init(&default_backing_dev_info);
- if (!err)
- bdi_register(&default_backing_dev_info, NULL, "default");
err = bdi_init(&noop_backing_dev_info);
return err;
@@ -355,19 +332,19 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- if (!bdi_cap_writeback_dirty(bdi))
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&bdi->wb_lock);
+ if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
+ spin_unlock_bh(&bdi->wb_lock);
return;
+ }
+ spin_unlock_bh(&bdi->wb_lock);
/*
* Make sure nobody finds us on the bdi_list anymore
*/
bdi_remove_from_list(bdi);
- /* Make sure nobody queues further work */
- spin_lock_bh(&bdi->wb_lock);
- clear_bit(BDI_registered, &bdi->state);
- spin_unlock_bh(&bdi->wb_lock);
-
/*
* Drain work list and shutdown the delayed_work. At this point,
* @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
@@ -375,37 +352,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
*/
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
flush_delayed_work(&bdi->wb.dwork);
- WARN_ON(!list_empty(&bdi->work_list));
- WARN_ON(delayed_work_pending(&bdi->wb.dwork));
}
/*
- * This bdi is going away now, make sure that no super_blocks point to it
+ * Called when the device behind @bdi has been removed or ejected.
+ *
+ * We can't really do much here except for reducing the dirty ratio at
+ * the moment. In the future we should be able to set a flag so that
+ * the filesystem can handle errors at mark_inode_dirty time instead
+ * of only at writeback time.
*/
-static void bdi_prune_sb(struct backing_dev_info *bdi)
-{
- struct super_block *sb;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_bdi == bdi)
- sb->s_bdi = &default_backing_dev_info;
- }
- spin_unlock(&sb_lock);
-}
-
void bdi_unregister(struct backing_dev_info *bdi)
{
- if (bdi->dev) {
- bdi_set_min_ratio(bdi, 0);
- trace_writeback_bdi_unregister(bdi);
- bdi_prune_sb(bdi);
+ if (WARN_ON_ONCE(!bdi->dev))
+ return;
- bdi_wb_shutdown(bdi);
- bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
- bdi->dev = NULL;
- }
+ bdi_set_min_ratio(bdi, 0);
}
EXPORT_SYMBOL(bdi_unregister);
@@ -474,37 +436,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
- /*
- * Splice our entries to the default_backing_dev_info. This
- * condition shouldn't happen. @wb must be empty at this point and
- * dirty inodes on it might cause other issues. This workaround is
- * added by ce5f8e779519 ("writeback: splice dirty inode entries to
- * default bdi on bdi_destroy()") without root-causing the issue.
- *
- * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
- * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
- *
- * We should probably add WARN_ON() to find out whether it still
- * happens and track it down if so.
- */
- if (bdi_has_dirty_io(bdi)) {
- struct bdi_writeback *dst = &default_backing_dev_info.wb;
-
- bdi_lock_two(&bdi->wb, dst);
- list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
- list_splice(&bdi->wb.b_io, &dst->b_io);
- list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&bdi->wb.list_lock);
- spin_unlock(&dst->list_lock);
- }
-
- bdi_unregister(bdi);
+ bdi_wb_shutdown(bdi);
+ WARN_ON(!list_empty(&bdi->work_list));
WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+ if (bdi->dev) {
+ bdi_debug_unregister(bdi);
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
-
fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
@@ -513,13 +457,12 @@ EXPORT_SYMBOL(bdi_destroy);
* For use from filesystems to quickly init and register a bdi associated
* with dirty writeback
*/
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
- unsigned int cap)
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
{
int err;
bdi->name = name;
- bdi->capabilities = cap;
+ bdi->capabilities = 0;
err = bdi_init(bdi);
if (err)
return err;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..fac23ecf8d72 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
else
endbyte--; /* inclusive */
- bdi = mapping->backing_dev_info;
+ bdi = inode_to_bdi(mapping->host);
switch (advice) {
case POSIX_FADV_NORMAL:
@@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!bdi_write_congested(mapping->backing_dev_info))
+ if (!bdi_write_congested(bdi))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
diff --git a/mm/filemap.c b/mm/filemap.c
index bf7a27142704..d9f5336552d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
}
}
@@ -2564,7 +2564,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 70c09da1a419..c175f9f25210 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -9,6 +9,7 @@
*/
#include <linux/fs.h>
+#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/export.h>
#include <linux/uio.h>
@@ -409,7 +410,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
count = len;
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
if (ret)
diff --git a/mm/madvise.c b/mm/madvise.c
index d79fb5e8f80a..1077cbdc8b52 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct file *file = vma->vm_file;
#ifdef CONFIG_SWAP
- if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+ if (!file) {
*prev = vma;
- if (!file)
- force_swapin_readahead(vma, start, end);
- else
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ force_swapin_readahead(vma, start, end);
return 0;
}
-#endif
+ if (shmem_mapping(file->f_mapping)) {
+ *prev = vma;
+ force_shm_swapin_readahead(vma, start, end,
+ file->f_mapping);
+ return 0;
+ }
+#else
if (!file)
return -EBADF;
+#endif
if (file->f_mapping->a_ops->get_xip_mem) {
/* no bad return value, but ignore advice */
diff --git a/mm/nommu.c b/mm/nommu.c
index 1a19fb3b0463..7296360fc057 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -980,9 +980,6 @@ static int validate_mmap_request(struct file *file,
return -EOVERFLOW;
if (file) {
- /* validate file mapping requests */
- struct address_space *mapping;
-
/* files must support mmap */
if (!file->f_op->mmap)
return -ENODEV;
@@ -991,28 +988,22 @@ static int validate_mmap_request(struct file *file,
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
- mapping = file->f_mapping;
- if (!mapping)
- mapping = file_inode(file)->i_mapping;
-
- capabilities = 0;
- if (mapping && mapping->backing_dev_info)
- capabilities = mapping->backing_dev_info->capabilities;
-
- if (!capabilities) {
+ if (file->f_op->mmap_capabilities) {
+ capabilities = file->f_op->mmap_capabilities(file);
+ } else {
/* no explicit capabilities set, so assume some
* defaults */
switch (file_inode(file)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
break;
case S_IFCHR:
capabilities =
- BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP |
- BDI_CAP_WRITE_MAP;
+ NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE;
break;
default:
@@ -1023,9 +1014,9 @@ static int validate_mmap_request(struct file *file,
/* eliminate any capabilities that we can't support on this
* device */
if (!file->f_op->get_unmapped_area)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (!file->f_op->read)
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
/* The file shall have been opened with read permission. */
if (!(file->f_mode & FMODE_READ))
@@ -1044,29 +1035,29 @@ static int validate_mmap_request(struct file *file,
if (locks_verify_locked(file))
return -EAGAIN;
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
return -ENODEV;
/* we mustn't privatise shared mappings */
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
} else {
/* we're going to read the file into private memory we
* allocate */
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
return -ENODEV;
/* we don't permit a private writable mapping to be
* shared with the backing device */
if (prot & PROT_WRITE)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
- if (capabilities & BDI_CAP_MAP_DIRECT) {
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
+ ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+ ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
) {
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (flags & MAP_SHARED) {
printk(KERN_WARNING
"MAP_SHARED not completely supported on !MMU\n");
@@ -1083,21 +1074,21 @@ static int validate_mmap_request(struct file *file,
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
- if (capabilities & BDI_CAP_EXEC_MAP)
+ if (capabilities & NOMMU_MAP_EXEC)
prot |= PROT_EXEC;
}
} else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
- !(capabilities & BDI_CAP_EXEC_MAP)
+ !(capabilities & NOMMU_MAP_EXEC)
) {
/* backing file is not executable, try to copy */
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
} else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
/* handle PROT_EXEC implication by PROT_READ */
if ((prot & PROT_READ) &&
@@ -1129,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
- if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+ if (!(capabilities & NOMMU_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
@@ -1138,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file,
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
+ vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
if (flags & MAP_SHARED)
vm_flags |= VM_SHARED;
}
@@ -1191,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret == 0) {
/* shouldn't return success if we're not sharing */
@@ -1380,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file,
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
/* new mapping is not a subset of the region */
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
goto sharing_violation;
continue;
}
@@ -1419,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file,
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
if (IS_ERR_VALUE(addr)) {
@@ -1431,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file,
* the mapping so we'll have to attempt to copy
* it */
ret = -ENODEV;
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
goto error_just_free;
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
} else {
vma->vm_start = region->vm_start = addr;
vma->vm_end = region->vm_end = addr + len;
@@ -1445,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_region = region;
/* set up the mapping
- * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+ * - the region is filled in if NOMMU_MAP_DIRECT is still set
*/
if (file && vma->vm_flags & VM_SHARED)
ret = do_mmap_shared_file(vma);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6a73e47e81c6..45e187b2d971 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
unsigned long pos_ratio;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
@@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
int ratelimit;
int *p;
@@ -1929,7 +1929,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
+ __inc_bdi_stat(bdi, BDI_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
@@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
- dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2298,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page)
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
return 1;
}
@@ -2316,7 +2318,7 @@ int test_clear_page_writeback(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2351,7 +2353,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2405,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged);
*/
void wait_for_stable_page(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
-
- if (!bdi_cap_stable_pages_required(bdi))
- return;
-
- wait_on_page_writeback(page);
+ if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/readahead.c b/mm/readahead.c
index 17b9172ec37f..935675844b2e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -27,7 +27,7 @@
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
- ra->ra_pages = mapping->backing_dev_info->ra_pages;
+ ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (bdi_read_congested(mapping->backing_dev_info))
+ if (bdi_read_congested(inode_to_bdi(mapping->host)))
return;
/* do read-ahead */
diff --git a/mm/shmem.c b/mm/shmem.c
index 864c878401e6..a63031fa3e0c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
-static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
goto redirty;
/*
- * shmem_backing_dev_info's capabilities prevent regular writeback or
- * sync from ever calling shmem_writepage; but a stacking filesystem
- * might use ->writepage of its underlying filesystem, in which case
- * tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync.
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
*/
if (!wbc->for_reclaim) {
WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
@@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
- inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = get_seconds();
info = SHMEM_I(inode);
@@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
bool shmem_mapping(struct address_space *mapping)
{
- return mapping->backing_dev_info == &shmem_backing_dev_info;
+ return mapping->host->i_sb->s_op == &shmem_ops;
}
#ifdef CONFIG_TMPFS
@@ -3225,10 +3219,6 @@ int __init shmem_init(void)
if (shmem_inode_cachep)
return 0;
- error = bdi_init(&shmem_backing_dev_info);
- if (error)
- goto out4;
-
error = shmem_init_inodecache();
if (error)
goto out3;
@@ -3252,8 +3242,6 @@ out1:
out2:
shmem_destroy_inodecache();
out3:
- bdi_destroy(&shmem_backing_dev_info);
-out4:
shm_mnt = ERR_PTR(error);
return error;
}
diff --git a/mm/swap.c b/mm/swap.c
index 5b3087228b99..cd3a5e64cea9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1138,8 +1138,6 @@ void __init swap_setup(void)
#ifdef CONFIG_SWAP
int i;
- if (bdi_init(swapper_spaces[0].backing_dev_info))
- panic("Failed to init swap bdi");
for (i = 0; i < MAX_SWAPFILES; i++)
spin_lock_init(&swapper_spaces[i].tree_lock);
#endif
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9711342987a0..405923f77334 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = {
#endif
};
-static struct backing_dev_info swap_backing_dev_info = {
- .name = "swap",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
struct address_space swapper_spaces[MAX_SWAPFILES] = {
[0 ... MAX_SWAPFILES - 1] = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.i_mmap_writable = ATOMIC_INIT(0),
.a_ops = &swap_aops,
- .backing_dev_info = &swap_backing_dev_info,
}
};
diff --git a/mm/truncate.c b/mm/truncate.c
index f1e4d6052369..ddec5a5966d7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
if (account_size)
task_io_account_cancelled_write(account_size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 803886b8e353..5e8eadd71bac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -538,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info, sc))
+ if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -917,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(mapping->backing_dev_info)) ||
+ bdi_write_congested(inode_to_bdi(mapping->host))) ||
(writeback && PageReclaim(page)))
nr_congested++;
diff --git a/security/security.c b/security/security.c
index 18b35c63fc0c..a0442b20f001 100644
--- a/security/security.c
+++ b/security/security.c
@@ -726,16 +726,15 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
return prot | PROT_EXEC;
/*
* ditto if it's not on noexec mount, except that on !MMU we need
- * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case
+ * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
*/
if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
#ifndef CONFIG_MMU
- unsigned long caps = 0;
- struct address_space *mapping = file->f_mapping;
- if (mapping && mapping->backing_dev_info)
- caps = mapping->backing_dev_info->capabilities;
- if (!(caps & BDI_CAP_EXEC_MAP))
- return prot;
+ if (file->f_op->mmap_capabilities) {
+ unsigned caps = file->f_op->mmap_capabilities(file);
+ if (!(caps & NOMMU_MAP_EXEC))
+ return prot;
+ }
#endif
return prot | PROT_EXEC;
}