From 3d5fe03a3ea013060ebba2a811aeb0f23f56aefa Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 Jan 2016 15:22:26 -0800 Subject: zram/zcomp: use GFP_NOIO to allocate streams We can end up allocating a new compression stream with GFP_KERNEL from within the IO path, which may result is nested (recursive) IO operations. That can introduce problems if the IO path in question is a reclaimer, holding some locks that will deadlock nested IOs. Allocate streams and working memory using GFP_NOIO flag, forbidding recursive IO and FS operations. An example: inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. git/20158 [HC0[0]:SC0[0]:HE1:SE1] takes: (jbd2_handle){+.+.?.}, at: start_this_handle+0x4ca/0x555 {IN-RECLAIM_FS-W} state was registered at: __lock_acquire+0x8da/0x117b lock_acquire+0x10c/0x1a7 start_this_handle+0x52d/0x555 jbd2__journal_start+0xb4/0x237 __ext4_journal_start_sb+0x108/0x17e ext4_dirty_inode+0x32/0x61 __mark_inode_dirty+0x16b/0x60c iput+0x11e/0x274 __dentry_kill+0x148/0x1b8 shrink_dentry_list+0x274/0x44a prune_dcache_sb+0x4a/0x55 super_cache_scan+0xfc/0x176 shrink_slab.part.14.constprop.25+0x2a2/0x4d3 shrink_zone+0x74/0x140 kswapd+0x6b7/0x930 kthread+0x107/0x10f ret_from_fork+0x3f/0x70 irq event stamp: 138297 hardirqs last enabled at (138297): debug_check_no_locks_freed+0x113/0x12f hardirqs last disabled at (138296): debug_check_no_locks_freed+0x33/0x12f softirqs last enabled at (137818): __do_softirq+0x2d3/0x3e9 softirqs last disabled at (137813): irq_exit+0x41/0x95 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(jbd2_handle); lock(jbd2_handle); *** DEADLOCK *** 5 locks held by git/20158: #0: (sb_writers#7){.+.+.+}, at: [] mnt_want_write+0x24/0x4b #1: (&type->i_mutex_dir_key#2/1){+.+.+.}, at: [] lock_rename+0xd9/0xe3 #2: (&sb->s_type->i_mutex_key#11){+.+.+.}, at: [] lock_two_nondirectories+0x3f/0x6b #3: (&sb->s_type->i_mutex_key#11/4){+.+.+.}, at: [] lock_two_nondirectories+0x66/0x6b #4: (jbd2_handle){+.+.?.}, at: [] start_this_handle+0x4ca/0x555 stack backtrace: CPU: 2 PID: 20158 Comm: git Not tainted 4.1.0-rc7-next-20150615-dbg-00016-g8bdf555-dirty #211 Call Trace: dump_stack+0x4c/0x6e mark_lock+0x384/0x56d mark_held_locks+0x5f/0x76 lockdep_trace_alloc+0xb2/0xb5 kmem_cache_alloc_trace+0x32/0x1e2 zcomp_strm_alloc+0x25/0x73 [zram] zcomp_strm_multi_find+0xe7/0x173 [zram] zcomp_strm_find+0xc/0xe [zram] zram_bvec_rw+0x2ca/0x7e0 [zram] zram_make_request+0x1fa/0x301 [zram] generic_make_request+0x9c/0xdb submit_bio+0xf7/0x120 ext4_io_submit+0x2e/0x43 ext4_bio_write_page+0x1b7/0x300 mpage_submit_page+0x60/0x77 mpage_map_and_submit_buffers+0x10f/0x21d ext4_writepages+0xc8c/0xe1b do_writepages+0x23/0x2c __filemap_fdatawrite_range+0x84/0x8b filemap_flush+0x1c/0x1e ext4_alloc_da_blocks+0xb8/0x117 ext4_rename+0x132/0x6dc ? mark_held_locks+0x5f/0x76 ext4_rename2+0x29/0x2b vfs_rename+0x540/0x636 SyS_renameat2+0x359/0x44d SyS_rename+0x1e/0x20 entry_SYSCALL_64_fastpath+0x12/0x6f [minchan@kernel.org: add stable mark] Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Kyeongdon Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 4 ++-- drivers/block/zram/zcomp_lz4.c | 2 +- drivers/block/zram/zcomp_lzo.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5cb13ca3a3ac..c53617752b93 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -76,7 +76,7 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) */ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO); if (!zstrm) return NULL; @@ -85,7 +85,7 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1); if (!zstrm->private || !zstrm->buffer) { zcomp_strm_free(comp, zstrm); zstrm = NULL; diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index f2afb7e988c3..ee44b51130a4 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -15,7 +15,7 @@ static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + return kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO); } static void zcomp_lz4_destroy(void *private) diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index da1bc47d588e..683ce049e070 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -15,7 +15,7 @@ static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + return kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO); } static void lzo_destroy(void *private) -- cgit v1.2.3 From d913897abace843bba20249f3190167f7895e9c3 Mon Sep 17 00:00:00 2001 From: Kyeongdon Kim Date: Thu, 14 Jan 2016 15:22:29 -0800 Subject: zram: try vmalloc() after kmalloc() When we're using LZ4 multi compression streams for zram swap, we found out page allocation failure message in system running test. That was not only once, but a few(2 - 5 times per test). Also, some failure cases were continually occurring to try allocation order 3. In order to make parallel compression private data, we should call kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order 2/3 size memory to allocate in that time, page allocation fails. This patch makes to use vmalloc() as fallback of kmalloc(), this prevents page alloc failure warning. After using this, we never found warning message in running test, also It could reduce process startup latency about 60-120ms in each case. For reference a call trace : Binder_1: page allocation failure: order:3, mode:0x10c0d0 CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20 Call trace: dump_backtrace+0x0/0x270 show_stack+0x10/0x1c dump_stack+0x1c/0x28 warn_alloc_failed+0xfc/0x11c __alloc_pages_nodemask+0x724/0x7f0 __get_free_pages+0x14/0x5c kmalloc_order_trace+0x38/0xd8 zcomp_lz4_create+0x2c/0x38 zcomp_strm_alloc+0x34/0x78 zcomp_strm_multi_find+0x124/0x1ec zcomp_strm_find+0xc/0x18 zram_bvec_rw+0x2fc/0x780 zram_make_request+0x25c/0x2d4 generic_make_request+0x80/0xbc submit_bio+0xa4/0x15c __swap_writepage+0x218/0x230 swap_writepage+0x3c/0x4c shrink_page_list+0x51c/0x8d0 shrink_inactive_list+0x3f8/0x60c shrink_lruvec+0x33c/0x4cc shrink_zone+0x3c/0x100 try_to_free_pages+0x2b8/0x54c __alloc_pages_nodemask+0x514/0x7f0 __get_free_pages+0x14/0x5c proc_info_read+0x50/0xe4 vfs_read+0xa0/0x12c SyS_read+0x44/0x74 DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB [minchan@kernel.org: change vmalloc gfp and adding comment about gfp] [sergey.senozhatsky@gmail.com: tweak comments and styles] Signed-off-by: Kyeongdon Kim Signed-off-by: Minchan Kim Acked-by: Sergey Senozhatsky Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp_lz4.c | 23 +++++++++++++++++++++-- drivers/block/zram/zcomp_lzo.c | 23 +++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index ee44b51130a4..dd6083124276 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -10,17 +10,36 @@ #include #include #include +#include +#include #include "zcomp_lz4.h" static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO); + void *ret; + + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. + */ + ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); + if (!ret) + ret = __vmalloc(LZ4_MEM_COMPRESS, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | + __GFP_ZERO | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; } static void zcomp_lz4_destroy(void *private) { - kfree(private); + kvfree(private); } static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index 683ce049e070..edc549920fa0 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -10,17 +10,36 @@ #include #include #include +#include +#include #include "zcomp_lzo.h" static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO); + void *ret; + + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. + */ + ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); + if (!ret) + ret = __vmalloc(LZO1X_MEM_COMPRESS, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | + __GFP_ZERO | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; } static void lzo_destroy(void *private) { - kfree(private); + kvfree(private); } static int lzo_compress(const unsigned char *src, unsigned char *dst, -- cgit v1.2.3 From 75d8947a36d0c9aedd69118d1f14bf424005c7c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 14 Jan 2016 15:22:32 -0800 Subject: zram: pass gfp from zcomp frontend to backend Each zcomp backend uses own gfp flag but it's pointless because the context they could be called is driven by upper layer(ie, zcomp frontend). As well, zcomp frondend could call them in different context. One context(ie, zram init part) is it should be better to make sure successful allocation other context(ie, further stream allocation part for accelarating I/O speed) is just optional so let's pass gfp down from driver (ie, zcomp frontend) like normal MM convention. [sergey.senozhatsky@gmail.com: add missing __vmalloc zero and highmem gfps] Signed-off-by: Minchan Kim Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 24 ++++++++++++++++-------- drivers/block/zram/zcomp.h | 2 +- drivers/block/zram/zcomp_lz4.c | 16 +++------------- drivers/block/zram/zcomp_lzo.c | 16 +++------------- 4 files changed, 23 insertions(+), 35 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index c53617752b93..3ef42e563bb5 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -74,18 +74,18 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) * allocate new zcomp_strm structure with ->private initialized by * backend, return NULL on error */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); if (!zstrm) return NULL; - zstrm->private = comp->backend->create(); + zstrm->private = comp->backend->create(flags); /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1); + zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); if (!zstrm->private || !zstrm->buffer) { zcomp_strm_free(comp, zstrm); zstrm = NULL; @@ -120,8 +120,16 @@ static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) /* allocate new zstrm stream */ zs->avail_strm++; spin_unlock(&zs->strm_lock); - - zstrm = zcomp_strm_alloc(comp); + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. + */ + zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); if (!zstrm) { spin_lock(&zs->strm_lock); zs->avail_strm--; @@ -209,7 +217,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) zs->max_strm = max_strm; zs->avail_strm = 1; - zstrm = zcomp_strm_alloc(comp); + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); if (!zstrm) { kfree(zs); return -ENOMEM; @@ -259,7 +267,7 @@ static int zcomp_strm_single_create(struct zcomp *comp) comp->stream = zs; mutex_init(&zs->strm_lock); - zs->zstrm = zcomp_strm_alloc(comp); + zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); if (!zs->zstrm) { kfree(zs); return -ENOMEM; diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 46e2b9f8f1f0..b7d2a4bcae54 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -33,7 +33,7 @@ struct zcomp_backend { int (*decompress)(const unsigned char *src, size_t src_len, unsigned char *dst); - void *(*create)(void); + void *(*create)(gfp_t flags); void (*destroy)(void *private); const char *name; diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index dd6083124276..dc2338d5258c 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -15,24 +15,14 @@ #include "zcomp_lz4.h" -static void *zcomp_lz4_create(void) +static void *zcomp_lz4_create(gfp_t flags) { void *ret; - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. - */ - ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); + ret = kzalloc(LZ4_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZ4_MEM_COMPRESS, - GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | - __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_ZERO | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index edc549920fa0..0ab6fce8abe4 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -15,24 +15,14 @@ #include "zcomp_lzo.h" -static void *lzo_create(void) +static void *lzo_create(gfp_t flags) { void *ret; - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. - */ - ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); + ret = kzalloc(LZO1X_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZO1X_MEM_COMPRESS, - GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | - __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_ZERO | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } -- cgit v1.2.3 From e02d238c9852a91b30da9ea32ce36d1416cdc683 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 Jan 2016 15:22:35 -0800 Subject: zram/zcomp: do not zero out zcomp private pages Do not __GFP_ZERO allocated zcomp ->private pages. We keep allocated streams around and use them for read/write requests, so we supply a zeroed out ->private to compression algorithm as a scratch buffer only once -- the first time we use that stream. For the rest of IO requests served by this stream ->private usually contains some temporarily data from the previous requests. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp_lz4.c | 4 ++-- drivers/block/zram/zcomp_lzo.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index dc2338d5258c..0110086accba 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -19,10 +19,10 @@ static void *zcomp_lz4_create(gfp_t flags) { void *ret; - ret = kzalloc(LZ4_MEM_COMPRESS, flags); + ret = kmalloc(LZ4_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZ4_MEM_COMPRESS, - flags | __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index 0ab6fce8abe4..ed7a1f0549ec 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -19,10 +19,10 @@ static void *lzo_create(gfp_t flags) { void *ret; - ret = kzalloc(LZO1X_MEM_COMPRESS, flags); + ret = kmalloc(LZO1X_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZO1X_MEM_COMPRESS, - flags | __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } -- cgit v1.2.3