From fc37a3b8b4388e73e8e3525556d9f1feeb232bb9 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 30 Jun 2021 18:57:09 -0700 Subject: ipc sem: use kvmalloc for sem_undo allocation Patch series "ipc: allocations cleanup", v2. Some ipc objects use the wrong allocation functions: small objects can use kvmalloc(), and vice versa, potentially large objects can use kmalloc(). This patch (of 2): Size of sem_undo can exceed one page and with the maximum possible nsems = 32000 it can grow up to 64KB. Let's switch its allocation to kvmalloc to avoid user-triggered disruptive actions like OOM killer in case of high-order memory shortage. User triggerable high order allocations are quite a problem on heavily fragmented systems. They can be a DoS vector. Link: https://lkml.kernel.org/r/ebc3ac79-3190-520d-81ce-22ad194986ec@virtuozzo.com Link: https://lkml.kernel.org/r/a6354fd9-2d55-2e63-dd4d-fa7dc1d11134@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Alexey Dobriyan Cc: Davidlohr Bueso Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Johannes Weiner Cc: Manfred Spraul Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'ipc') diff --git a/ipc/sem.c b/ipc/sem.c index bf534c74293e..3a58188733d8 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1154,7 +1154,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) un->semid = -1; list_del_rcu(&un->list_proc); spin_unlock(&un->ulp->lock); - kfree_rcu(un, rcu); + kvfree_rcu(un, rcu); } /* Wake up all pending processes and let them fail with EIDRM. 
*/ @@ -1937,7 +1937,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) rcu_read_unlock(); /* step 2: allocate new undo structure */ - new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, + GFP_KERNEL); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); @@ -1949,7 +1950,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); - kfree(new); + kvfree(new); un = ERR_PTR(-EIDRM); goto out; } @@ -1960,7 +1961,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) */ un = lookup_undo(ulp, semid); if (un) { - kfree(new); + kvfree(new); goto success; } /* step 5: initialize & link new undo structure */ @@ -2420,7 +2421,7 @@ void exit_sem(struct task_struct *tsk) rcu_read_unlock(); wake_up_q(&wake_q); - kfree_rcu(un, rcu); + kvfree_rcu(un, rcu); } kfree(ulp); } -- cgit v1.2.3 From bc8136a543aa839a848b49af5e101ac6de5f6b27 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 30 Jun 2021 18:57:12 -0700 Subject: ipc: use kmalloc for msg_queue and shmid_kernel msg_queue and shmid_kernel are quite small objects, no need to use kvmalloc for them. mhocko@: "Both of them are 256B on most 64b systems." Previously these objects were allocated via ipc_alloc/ipc_rcu_alloc(), a common function for several ipc objects. It had a kvmalloc() call inside. Later, this function went away and was finally replaced by a direct kvmalloc() call, and now we can use the more suitable kmalloc/kfree for them. 
Link: https://lkml.kernel.org/r/0d0b6c9b-8af3-29d8-34e2-a565c53780f3@virtuozzo.com Reported-by: Alexey Dobriyan Signed-off-by: Vasily Averin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 6 +++--- ipc/shm.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'ipc') diff --git a/ipc/msg.c b/ipc/msg.c index 6e6c8e0c9380..6810276d6bb9 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -130,7 +130,7 @@ static void msg_rcu_free(struct rcu_head *head) struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(&msq->q_perm); - kvfree(msq); + kfree(msq); } /** @@ -147,7 +147,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int msgflg = params->flg; - msq = kvmalloc(sizeof(*msq), GFP_KERNEL); + msq = kmalloc(sizeof(*msq), GFP_KERNEL); if (unlikely(!msq)) return -ENOMEM; @@ -157,7 +157,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) msq->q_perm.security = NULL; retval = security_msg_queue_alloc(&msq->q_perm); if (retval) { - kvfree(msq); + kfree(msq); return retval; } diff --git a/ipc/shm.c b/ipc/shm.c index febd88daba8c..a66b2664558b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -222,7 +222,7 @@ static void shm_rcu_free(struct rcu_head *head) struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, shm_perm); security_shm_free(&shp->shm_perm); - kvfree(shp); + kfree(shp); } static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) @@ -619,7 +619,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; - shp = kvmalloc(sizeof(*shp), GFP_KERNEL); + shp = kmalloc(sizeof(*shp), GFP_KERNEL); if (unlikely(!shp)) return -ENOMEM; @@ -630,7 
+630,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); if (error) { - kvfree(shp); + kfree(shp); return error; } -- cgit v1.2.3 From 17d056e0bdaab3d3f1fbec1ac154addcc4183aed Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 30 Jun 2021 18:57:15 -0700 Subject: ipc/sem.c: use READ_ONCE()/WRITE_ONCE() for use_global_lock The patch solves three weaknesses in ipc/sem.c: 1) The initial read of use_global_lock in sem_lock() is an intentional race. KCSAN detects these accesses and prints a warning. 2) The code assumes that plain C reads/writes are not mangled by the CPU or the compiler. 3) The comment in sysvipc_sem_proc_show() was hard to understand: The rest of the comments in ipc/sem.c speak about sem_perm.lock, and suddenly this function speaks about ipc_lock_object(). To solve 1) and 2), use READ_ONCE()/WRITE_ONCE(). Plain C reads are used in code that owns sma->sem_perm.lock. The comment is updated to solve 3). [manfred@colorfullife.com: use READ_ONCE()/WRITE_ONCE() for use_global_lock] Link: https://lkml.kernel.org/r/20210627161919.3196-3-manfred@colorfullife.com Link: https://lkml.kernel.org/r/20210514175319.12195-1-manfred@colorfullife.com Signed-off-by: Manfred Spraul Reviewed-by: Paul E. McKenney Reviewed-by: Davidlohr Bueso Cc: <1vier1@web.de> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'ipc') diff --git a/ipc/sem.c b/ipc/sem.c index 3a58188733d8..971e75d28364 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -217,6 +217,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() * is inside a spin_lock() and after a write from 0 to non-zero a * spin_lock()+spin_unlock() is done. 
+ * To prevent the compiler/cpu temporarily writing 0 to use_global_lock, + * READ_ONCE()/WRITE_ONCE() is used. * * 2) queue.status: (SEM_BARRIER_2) * Initialization is done while holding sem_lock(), so no further barrier is @@ -342,10 +344,10 @@ static void complexmode_enter(struct sem_array *sma) * Nothing to do, just reset the * counter until we return to simple mode. */ - sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; + WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); return; } - sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; + WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); for (i = 0; i < sma->sem_nsems; i++) { sem = &sma->sems[i]; @@ -371,7 +373,8 @@ static void complexmode_tryleave(struct sem_array *sma) /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { - sma->use_global_lock--; + WRITE_ONCE(sma->use_global_lock, + sma->use_global_lock-1); } } @@ -412,7 +415,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. */ - if (!sma->use_global_lock) { + if (!READ_ONCE(sma->use_global_lock)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. @@ -2436,7 +2439,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) /* * The proc interface isn't aware of sem_lock(), it calls - * ipc_lock_object() directly (in sysvipc_find_ipc). + * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock). + * (in sysvipc_find_ipc) * In order to stay compatible with sem_lock(), we must * enter / leave complex_mode. 
*/ -- cgit v1.2.3 From b869d5be0acf0e125e69adcffdca04000dc5b17c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 30 Jun 2021 18:57:18 -0700 Subject: ipc/util.c: use binary search for max_idx If semctl(), msgctl() and shmctl() are called with IPC_INFO, SEM_INFO, MSG_INFO or SHM_INFO, then the return value is the index of the highest used entry in the kernel's internal array recording information about all SysV objects of the requested type for the current namespace. (This information can be used with repeated ..._STAT or ..._STAT_ANY operations to obtain information about all SysV objects on the system.) There is a cache for this value. But when the cache needs to be updated, then the highest used index is determined by looping over all possible values. With the introduction of IPCMNI_EXTEND_SHIFT, this could be a loop over 16 million entries. And due to /proc/sys/kernel/*next_id, the index values do not need to be consecutive. With <write 16000000 to msg_next_id>, msgget(), msgctl(,IPC_RMID) in a loop, I have observed a performance increase by a factor of around 13000. As there is no get_last() function for idr structures: Implement a "get_last()" using a binary search. As far as I see, ipc is the only user that needs get_last(), thus implement it in ipc/util.c and not in a central location. 
[akpm@linux-foundation.org: tweak comment, fix typo] Link: https://lkml.kernel.org/r/20210425075208.11777-2-manfred@colorfullife.com Signed-off-by: Manfred Spraul Acked-by: Davidlohr Bueso Cc: <1vier1@web.de> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 44 +++++++++++++++++++++++++++++++++++++++----- ipc/util.h | 3 +++ 2 files changed, 42 insertions(+), 5 deletions(-) (limited to 'ipc') diff --git a/ipc/util.c b/ipc/util.c index cfa0045e748d..0027e47626b7 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -64,6 +64,7 @@ #include #include #include +#include #include @@ -450,6 +451,41 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) ipc_kht_params); } +/** + * ipc_search_maxidx - search for the highest assigned index + * @ids: ipc identifier set + * @limit: known upper limit for highest assigned index + * + * The function determines the highest assigned index in @ids. It is intended + * to be called when ids->max_idx needs to be updated. + * Updating ids->max_idx is necessary when the current highest index ipc + * object is deleted. + * If no ipc object is allocated, then -1 is returned. + * + * ipc_ids.rwsem needs to be held by the caller. 
+ */ +static int ipc_search_maxidx(struct ipc_ids *ids, int limit) +{ + int tmpidx; + int i; + int retval; + + i = ilog2(limit+1); + + retval = 0; + for (; i >= 0; i--) { + tmpidx = retval | (1<ipcs_idr, &tmpidx)) + retval |= (1<deleted = true; if (unlikely(idx == ids->max_idx)) { - do { - idx--; - if (idx == -1) - break; - } while (!idr_find(&ids->ipcs_idr, idx)); + idx = ids->max_idx-1; + if (idx >= 0) + idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } diff --git a/ipc/util.h b/ipc/util.h index 5766c61aed0e..2dd7ce0416d8 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -145,6 +145,9 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); * ipc_get_maxidx - get the highest assigned index * @ids: ipc identifier set * + * The function returns the highest assigned index for @ids. The function + * doesn't scan the idr tree, it uses a cached value. + * * Called with ipc_ids.rwsem held for reading. */ static inline int ipc_get_maxidx(struct ipc_ids *ids) -- cgit v1.2.3