Hi Alexander,

A few more remarks.

On 7/14/21 7:30 PM, Alexander Mikhalitsyn wrote:
> This is a total rework of the fix.
> Thanks to Eric Biederman for the suggestions (but maybe I've misunderstood
> some of them :))
>
> I've tested it with a reproducer of the original problem. But of course it
> needs detailed testing. I hope to get some general comments about the
> design and implementation.
>
> ToDo: remove the unneeded "ns" argument from shm_destroy, shm_rmid and
> other functions.

What ensures that shp->ns is not destroyed prematurely?

I did some tests, and it seems that shmat() acquires a namespace refcount
and shm_release() puts it again, and shm_release() is late enough to ensure
that the ns cannot go out of scope. But I haven't checked all combinations
(with/without shm_rmid_forced, delete via exit(), shmctl(), shmdt(),
mmap()).

And: this should be documented somewhere.

> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -65,6 +65,7 @@ struct shmid_kernel /* private to the kernel */
>  	/* The task created the shm object. NULL if the task is dead. */
>  	struct task_struct	*shm_creator;
>  	struct list_head	shm_clist;	/* list by creator */

I think these comments are wrong/outdated: some parts of the new code check
with list_empty(shm_clist), not by looking at shm_creator.

> +	struct ipc_namespace	*ns;
>  } __randomize_layout;
>
>  /* shm_mode upper byte flags */
> @@ -115,6 +116,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
>  	struct shmid_kernel *shp;
>
>  	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> +	BUG_ON(shp->ns && ns != shp->ns);

Is shp->ns == NULL allowed/possible? From what I see, it is impossible. We
should not have a NULL check in some codepaths but not in others: either
everywhere, or nowhere.

>
>  	if (shp->shm_nattch) {
>  		shp->shm_perm.mode |= SHM_DEST;
> @@ -225,9 +227,32 @@ static void shm_rcu_free(struct rcu_head *head)
>  	kfree(shp);
>  }
>
> +static inline void task_shm_clist_lock(struct task_struct *task)
> +{
> +	spin_lock(&task->sysvshm.shm_clist_lock);
> +}
> +
> +static inline void task_shm_clist_unlock(struct task_struct *task)
> +{
> +	spin_unlock(&task->sysvshm.shm_clist_lock);
> +}
> +
> +void shm_clist_rm(struct shmid_kernel *shp)
> +{
> +	if (!shp->shm_creator)
> +		return;
> +
> +	task_shm_clist_lock(shp->shm_creator);
> +	list_del_init(&shp->shm_clist);
> +	task_shm_clist_unlock(shp->shm_creator);
> +	shp->shm_creator = NULL;
> +}
> +
>  static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
>  {
> -	list_del(&s->shm_clist);
> +	WARN_ON(s->ns && ns != s->ns);
> +	//list_del_init(&s->shm_clist);
> +	shm_clist_rm(s);
>  	ipc_rmid(&shm_ids(ns), &s->shm_perm);
>  }
>
> @@ -306,10 +331,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
>   *
>   * 2) sysctl kernel.shm_rmid_forced is set to 1.
>   */
> -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
> +static bool shm_may_destroy(struct shmid_kernel *shp)
>  {
>  	return (shp->shm_nattch == 0) &&
> -	       (ns->shm_rmid_forced ||
> +	       (shp->ns->shm_rmid_forced ||
>  		(shp->shm_perm.mode & SHM_DEST));
>  }
>

As written before: what ensures that shp->ns->shm_rmid_forced was not
released already?
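For reference, the pairing I found, paraphrased from ipc/shm.c as I read
it (treat this as a sketch only; the shm_file_data plumbing and the error
paths are elided):

/* do_shmat(): attaching pins the namespace via the per-attach shm file */
	sfd->ns = get_ipc_ns(ns);

/* ... and shm_release() -- the ->release() of the file that do_shmat()
 * created -- puts it again, i.e. only after the mapping is gone:
 */
static int shm_release(struct inode *ino, struct file *file)
{
	struct shm_file_data *sfd = shm_file_data(file);

	put_ipc_ns(sfd->ns);	/* pairs with get_ipc_ns() in do_shmat() */
	fput(sfd->file);
	shm_file_data(file) = NULL;
	kfree(sfd);
	return 0;
}

If this is really the mechanism the patch relies on, then a comment next
to the new shp->ns field would be a good place to document it.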
> @@ -340,7 +365,7 @@ static void shm_close(struct vm_area_struct *vma)
>  	ipc_update_pid(&shp->shm_lprid, task_tgid(current));
>  	shp->shm_dtim = ktime_get_real_seconds();
>  	shp->shm_nattch--;
> -	if (shm_may_destroy(ns, shp))
> +	if (shm_may_destroy(shp))
>  		shm_destroy(ns, shp);
>  	else
>  		shm_unlock(shp);
> @@ -361,10 +386,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
>   *
>   * As shp->* are changed under rwsem, it's safe to skip shp locking.
>   */
> -	if (shp->shm_creator != NULL)
> +	if (!list_empty(&shp->shm_clist))
>  		return 0;
>

This collides with the comment above: here, list_empty() is used.

> -	if (shm_may_destroy(ns, shp)) {
> +	if (shm_may_destroy(shp)) {
>  		shm_lock_by_ptr(shp);
>  		shm_destroy(ns, shp);
>  	}
> @@ -379,51 +404,77 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
>  	up_write(&shm_ids(ns).rwsem);
>  }
>
> +void shm_init_task(struct task_struct *task)
> +{
> +	INIT_LIST_HEAD(&(task)->sysvshm.shm_clist);
> +	spin_lock_init(&task->sysvshm.shm_clist_lock);
> +}
> +
>  /* Locking assumes this will only be called with task == current */
>  void exit_shm(struct task_struct *task)
>  {
> -	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
> +	LIST_HEAD(tmp);
>  	struct shmid_kernel *shp, *n;
>
>  	if (list_empty(&task->sysvshm.shm_clist))
>  		return;
>
> -	/*
> -	 * If kernel.shm_rmid_forced is not set then only keep track of
> -	 * which shmids are orphaned, so that a later set of the sysctl
> -	 * can clean them up.
> -	 */
> -	if (!ns->shm_rmid_forced) {
> -		down_read(&shm_ids(ns).rwsem);
> -		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
> -			shp->shm_creator = NULL;
> -		/*
> -		 * Only under read lock but we are only called on current
> -		 * so no entry on the list will be shared.
> -		 */
> -		list_del(&task->sysvshm.shm_clist);
> -		up_read(&shm_ids(ns).rwsem);
> -		return;
> -	}
> +	rcu_read_lock();	/* for refcount_inc_not_zero */
> +	task_shm_clist_lock(task);
>
> -	/*
> -	 * Destroy all already created segments, that were not yet mapped,
> -	 * and mark any mapped as orphan to cover the sysctl toggling.
> -	 * Destroy is skipped if shm_may_destroy() returns false.
> -	 */
> -	down_write(&shm_ids(ns).rwsem);
>  	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
> +		struct ipc_namespace *ns = shp->ns;
> +
> +		/*
> +		 * Remove shm from task list and nullify shm_creator, which
> +		 * marks the object as orphaned.
> +		 *
> +		 * If kernel.shm_rmid_forced is not set then only keep track of
> +		 * which shmids are orphaned, so that a later set of the sysctl
> +		 * can clean them up.
> +		 */
> +		list_del_init(&shp->shm_clist);
>  		shp->shm_creator = NULL;
>
> -		if (shm_may_destroy(ns, shp)) {
> -			shm_lock_by_ptr(shp);
> -			shm_destroy(ns, shp);
> +		printk("exit_shm() %px refcnt=%u, id=%d,key=%x\n", shp,
> +			refcount_read(&shp->shm_perm.refcount),
> +			shp->shm_perm.id, shp->shm_perm.key
> +		);
> +
> +		/*
> +		 * Will destroy all already created segments that were not yet
> +		 * mapped, and mark any mapped as orphan to cover the sysctl
> +		 * toggling. Destroy is skipped if shm_may_destroy() returns
> +		 * false.
> +		 */
> +		if (shp->ns->shm_rmid_forced && shm_may_destroy(shp)) {
> +			/*
> +			 * We may race with shm_exit_ns() if the refcounter is
> +			 * already zero. Let's skip shm_destroy() of such a shm
> +			 * object, as it will be destroyed during shm_exit_ns().
> +			 */
> +			if (!refcount_inc_not_zero(&ns->ns.count)) /* get_ipc_ns */
> +				continue;
> +
> +			list_add(&shp->shm_clist, &tmp);
>  		}
>  	}
>
> -	/* Remove the list head from any segments still attached. */
>  	list_del(&task->sysvshm.shm_clist);
> -	up_write(&shm_ids(ns).rwsem);
> +	task_shm_clist_unlock(task);
> +	rcu_read_unlock();
> +
> +	list_for_each_entry_safe(shp, n, &tmp, shm_clist) {
> +		struct ipc_namespace *ns = shp->ns;
> +
> +		list_del_init(&shp->shm_clist);
> +
> +		down_write(&shm_ids(ns).rwsem);
> +		shm_lock_by_ptr(shp);
> +		/* will do put_ipc_ns(shp->ns) */
> +		shm_destroy(ns, shp);
> +		up_write(&shm_ids(ns).rwsem);
> +		put_ipc_ns(ns);	/* see refcount_inc_not_zero */
> +	}
>  }

I do not see the advantage of first collecting everything in a local list
and then destroying the elements. (Regarding the refcount_inc_not_zero()
pattern, see the P.S. below.)

Attached is my current test case. Feel free to merge whatever you consider
useful into your change.

--
    Manfred
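P.S.: On the refcount_inc_not_zero(&ns->ns.count) call in exit_shm(): if
that pattern stays, I would wrap it in a small helper next to get_ipc_ns()
in ipc_namespace.h, so that the "the count may already be zero because
shm_exit_ns() is running" rule is written down exactly once. A sketch of
what I mean (the helper name is my suggestion, nothing that exists today):

/* Take a reference only if the namespace is not already being torn
 * down. Returns NULL when the refcount has already dropped to zero,
 * i.e. we lost the race against shm_exit_ns(). Must be called under
 * rcu_read_lock(), as exit_shm() above already does.
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
	if (ns && refcount_inc_not_zero(&ns->ns.count))
		return ns;
	return NULL;
}

exit_shm() would then do ns = get_ipc_ns_not_zero(shp->ns) and skip the
segment on NULL, which also makes the pairing with the later put_ipc_ns()
easier to see.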