summaryrefslogtreecommitdiff
path: root/mm/vmalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--mm/vmalloc.c310
1 files changed, 212 insertions, 98 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 31ff782d368b..9683573f1225 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -33,11 +33,11 @@
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
+#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
-#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
@@ -1915,6 +1915,13 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
+
+ /*
+ * An xarray requires an extra memory dynamically to
+ * be allocated. If it is an issue, we can use rb-tree
+ * instead.
+ */
+ struct xarray vmap_blocks;
};
struct vmap_block {
@@ -1932,11 +1939,48 @@ struct vmap_block {
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * XArray of vmap blocks, indexed by address, to quickly find a vmap block
- * in the free path. Could get rid of this if we change the API to return a
- * "cookie" from alloc, to be passed to free. But no big deal yet.
+ * In order to fast access to any "vmap_block" associated with a
+ * specific address, we use a hash.
+ *
+ * A per-cpu vmap_block_queue is used in both ways, to serialize
+ * an access to free block chains among CPUs(alloc path) and it
+ * also acts as a vmap_block hash(alloc/free paths). It means we
+ * overload it, since we already have the per-cpu array which is
+ * used as a hash table. When used as a hash a 'cpu' passed to
+ * per_cpu() is not actually a CPU but rather a hash index.
+ *
+ * A hash function is addr_to_vb_xa() which hashes any address
+ * to a specific index(in a hash) it belongs to. This then uses a
+ * per_cpu() macro to access an array with generated index.
+ *
+ * An example:
+ *
+ * CPU_1 CPU_2 CPU_0
+ * | | |
+ * V V V
+ * 0 10 20 30 40 50 60
+ * |------|------|------|------|------|------|...<vmap address space>
+ * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
+ *
+ * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
+ * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
+ * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
+ *
+ * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
+ * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
+ *
+ * This technique almost always avoids lock contention on insert/remove,
+ * however xarray spinlocks protect against any contention that remains.
*/
-static DEFINE_XARRAY(vmap_blocks);
+static struct xarray *
+addr_to_vb_xa(unsigned long addr)
+{
+ int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
+
+ return &per_cpu(vmap_block_queue, index).vmap_blocks;
+}
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1974,6 +2018,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
+ struct xarray *xa;
unsigned long vb_idx;
int node, err;
void *vaddr;
@@ -2007,8 +2052,9 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
+ xa = addr_to_vb_xa(va->va_start);
vb_idx = addr_to_vb_idx(va->va_start);
- err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ err = xa_insert(xa, vb_idx, vb, gfp_mask);
if (err) {
kfree(vb);
free_vmap_area(va);
@@ -2026,8 +2072,10 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
+ struct xarray *xa;
- tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
+ xa = addr_to_vb_xa(vb->va->va_start);
+ tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
spin_lock(&vmap_area_lock);
@@ -2139,6 +2187,7 @@ static void vb_free(unsigned long addr, unsigned long size)
unsigned long offset;
unsigned int order;
struct vmap_block *vb;
+ struct xarray *xa;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -2147,7 +2196,10 @@ static void vb_free(unsigned long addr, unsigned long size)
order = get_order(size);
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
- vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
+
+ xa = addr_to_vb_xa(addr);
+ vb = xa_load(xa, addr_to_vb_idx(addr));
+
spin_lock(&vb->lock);
bitmap_clear(vb->used_map, offset, (1UL << order));
spin_unlock(&vb->lock);
@@ -2743,7 +2795,7 @@ void vfree(const void *addr)
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
*/
- __free_pages(page, 0);
+ __free_page(page);
cond_resched();
}
atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
@@ -3194,7 +3246,7 @@ again:
* pages backing VM_ALLOC mapping. Memory is instead
* poisoned and zeroed by kasan_unpoison_vmalloc().
*/
- gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+ gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
}
/* Take note that the mapping is PAGE_KERNEL. */
@@ -3448,113 +3500,163 @@ void *vmalloc_32_user(unsigned long size)
EXPORT_SYMBOL(vmalloc_32_user);
/*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
+ * Atomically zero bytes in the iterator.
+ *
+ * Returns the number of zeroed bytes.
*/
+static size_t zero_iter(struct iov_iter *iter, size_t count)
+{
+ size_t remains = count;
-static int aligned_vread(char *buf, char *addr, unsigned long count)
+ while (remains > 0) {
+ size_t num, copied;
+
+ num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+ copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
+ remains -= copied;
+
+ if (copied < num)
+ break;
+ }
+
+ return count - remains;
+}
+
+/*
+ * small helper routine, copy contents to iter from addr.
+ * If the page is not present, fill zero.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t aligned_vread_iter(struct iov_iter *iter,
+ const char *addr, size_t count)
{
- struct page *p;
- int copied = 0;
+ size_t remains = count;
+ struct page *page;
- while (count) {
+ while (remains > 0) {
unsigned long offset, length;
+ size_t copied = 0;
offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
+ if (length > remains)
+ length = remains;
+ page = vmalloc_to_page(addr);
/*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calls for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
+ * To do safe access to this _mapped_ area, we need lock. But
+ * adding lock here means that we need to add overhead of
+ * vmalloc()/vfree() calls for this _debug_ interface, rarely
+ * used. Instead of that, we'll use an local mapping via
+ * copy_page_to_iter_nofault() and accept a small overhead in
+ * this access function.
*/
- if (p) {
- /* We can expect USER0 is not used -- see vread() */
- void *map = kmap_atomic(p);
- memcpy(buf, map + offset, length);
- kunmap_atomic(map);
- } else
- memset(buf, 0, length);
+ if (page)
+ copied = copy_page_to_iter_nofault(page, offset,
+ length, iter);
+ else
+ copied = zero_iter(iter, length);
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+ addr += copied;
+ remains -= copied;
+
+ if (copied != length)
+ break;
}
- return copied;
+
+ return count - remains;
}
-static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+/*
+ * Read from a vm_map_ram region of memory.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+ size_t count, unsigned long flags)
{
char *start;
struct vmap_block *vb;
+ struct xarray *xa;
unsigned long offset;
- unsigned int rs, re, n;
+ unsigned int rs, re;
+ size_t remains, n;
/*
* If it's area created by vm_map_ram() interface directly, but
* not further subdividing and delegating management to vmap_block,
* handle it here.
*/
- if (!(flags & VMAP_BLOCK)) {
- aligned_vread(buf, addr, count);
- return;
- }
+ if (!(flags & VMAP_BLOCK))
+ return aligned_vread_iter(iter, addr, count);
+
+ remains = count;
/*
* Area is split into regions and tracked with vmap_block, read out
* each region and zero fill the hole between regions.
*/
- vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+ xa = addr_to_vb_xa((unsigned long) addr);
+ vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
if (!vb)
- goto finished;
+ goto finished_zero;
spin_lock(&vb->lock);
if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
spin_unlock(&vb->lock);
- goto finished;
+ goto finished_zero;
}
+
for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
- if (!count)
- break;
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
+
start = vmap_block_vaddr(vb->va->va_start, rs);
- while (addr < start) {
- if (count == 0)
- goto unlock;
- *buf = '\0';
- buf++;
- addr++;
- count--;
+
+ if (addr < start) {
+ size_t to_zero = min_t(size_t, start - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
+ goto finished;
}
+
/*it could start reading from the middle of used region*/
offset = offset_in_page(addr);
n = ((re - rs + 1) << PAGE_SHIFT) - offset;
- if (n > count)
- n = count;
- aligned_vread(buf, start+offset, n);
+ if (n > remains)
+ n = remains;
- buf += n;
- addr += n;
- count -= n;
+ copied = aligned_vread_iter(iter, start + offset, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-unlock:
+
spin_unlock(&vb->lock);
-finished:
+finished_zero:
/* zero-fill the left dirty or free regions */
- if (count)
- memset(buf, 0, count);
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* We couldn't copy/zero everything */
+ spin_unlock(&vb->lock);
+ return count - remains;
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vread_iter() - read vmalloc area in a safe way to an iterator.
+ * @iter: the iterator to which data should be written.
+ * @addr: vm address.
+ * @count: number of bytes to be read.
*
* This function checks that addr is a valid vmalloc'ed area, and
* copy data from that area to a given buffer. If the given memory range
@@ -3574,13 +3676,12 @@ finished:
* (same number as @count) or %0 if [addr...addr+count) doesn't
* include any intersection with valid vmalloc area
*/
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
struct vmap_area *va;
struct vm_struct *vm;
- char *vaddr, *buf_start = buf;
- unsigned long buflen = count;
- unsigned long n, size, flags;
+ char *vaddr;
+ size_t n, size, flags, remains;
addr = kasan_reset_tag(addr);
@@ -3588,18 +3689,22 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
+ remains = count;
+
spin_lock(&vmap_area_lock);
va = find_vmap_area_exceed_addr((unsigned long)addr);
if (!va)
- goto finished;
+ goto finished_zero;
/* no intersects with alive vmap_area */
- if ((unsigned long)addr + count <= va->va_start)
- goto finished;
+ if ((unsigned long)addr + remains <= va->va_start)
+ goto finished_zero;
list_for_each_entry_from(va, &vmap_area_list, list) {
- if (!count)
- break;
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
vm = va->vm;
flags = va->flags & VMAP_FLAGS_MASK;
@@ -3614,6 +3719,7 @@ long vread(char *buf, char *addr, unsigned long count)
if (vm && (vm->flags & VM_UNINITIALIZED))
continue;
+
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
@@ -3622,38 +3728,45 @@ long vread(char *buf, char *addr, unsigned long count)
if (addr >= vaddr + size)
continue;
- while (addr < vaddr) {
- if (count == 0)
+
+ if (addr < vaddr) {
+ size_t to_zero = min_t(size_t, vaddr - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
goto finished;
- *buf = '\0';
- buf++;
- addr++;
- count--;
}
+
n = vaddr + size - addr;
- if (n > count)
- n = count;
+ if (n > remains)
+ n = remains;
if (flags & VMAP_RAM)
- vmap_ram_vread(buf, addr, n, flags);
+ copied = vmap_ram_vread_iter(iter, addr, n, flags);
else if (!(vm->flags & VM_IOREMAP))
- aligned_vread(buf, addr, n);
+ copied = aligned_vread_iter(iter, addr, n);
else /* IOREMAP area is treated as memory hole */
- memset(buf, 0, n);
- buf += n;
- addr += n;
- count -= n;
+ copied = zero_iter(iter, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-finished:
- spin_unlock(&vmap_area_lock);
- if (buf == buf_start)
- return 0;
+finished_zero:
+ spin_unlock(&vmap_area_lock);
/* zero-fill memory holes */
- if (buf != buf_start + buflen)
- memset(buf, 0, buflen - (buf - buf_start));
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* Nothing remains, or We couldn't copy/zero everything. */
+ spin_unlock(&vmap_area_lock);
- return buflen;
+ return count - remains;
}
/**
@@ -4278,6 +4391,7 @@ void __init vmalloc_init(void)
p = &per_cpu(vfree_deferred, i);
init_llist_head(&p->list);
INIT_WORK(&p->wq, delayed_vfree_work);
+ xa_init(&vbq->vmap_blocks);
}
/* Import existing vmlist entries. */