
Commit 9ed685a

Author: CKI KWF Bot (committed)
Merge: futex: Support task local hash map futex, FUTEX2_NUMA and FUTEX2_MPOL
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/7085
JIRA: https://issues.redhat.com/browse/RHEL-101190
Tested: The updated futex selftest was run with a patched RHEL9 kernel with no failures.
Omitted-fix: 4ec3c15 ("futex: Use correct exit on failure from futex_hash_allocate_default()")

Due to the missing upstream commit 304b3f2 ("sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork()"), which introduces the bad_fork_core_free label, the backport of commit 7c4f75a ("futex: Allow automatic allocation of process wide futex hash") has to branch to the bad_fork_cancel_cgroup label instead, which turns out to be the correct one.

The original focus of this MR is to backport support for a process-local futex hash map, avoiding unnecessary lock contention with other processes that access futexes unrelated to the current process. Such contention can add latency that breaks RT applications with strict latency limits. As part of the upstream series, this MR also backports support for the new FUTEX2_NUMA and FUTEX2_MPOL futexes, which can be useful for some workloads. Additional patches are also pulled in to reduce merge conflicts and context differences.

Signed-off-by: Waiman Long <longman@redhat.com>
Approved-by: Phil Auld <pauld@redhat.com>
Approved-by: Čestmír Kalina <ckalina@redhat.com>
Approved-by: Steve Best <sbest@redhat.com>
Approved-by: Aristeu Rozanski <arozansk@redhat.com>
Approved-by: Herton R. Krzesinski <herton@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: CKI GitLab Kmaint Pipeline Bot <26919896-cki-kmaint-pipeline-bot@users.noreply.gitlab.com>
2 parents aa8f567 + eb21f2a commit 9ed685a
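The process-private hash described in the commit message is controlled from userspace through prctl(). Below is a minimal, hedged userspace sketch of that interface; the PR_FUTEX_HASH constants (78/1/2) mirror the upstream uapi header added by this series and are defined locally here in case the installed headers predate it:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_FUTEX_HASH
#define PR_FUTEX_HASH			78	/* values assumed from the upstream uapi header */
#define PR_FUTEX_HASH_SET_SLOTS		1
#define PR_FUTEX_HASH_GET_SLOTS		2
#endif

int main(void)
{
	/* Request a 16-bucket process-private futex hash; the kernel may
	 * clamp or round the value, or fail on kernels without the feature. */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0) < 0)
		perror("PR_FUTEX_HASH_SET_SLOTS");

	/* Read back how many buckets the process-private hash actually has
	 * (0 would mean the process is still using the global hash). */
	printf("private futex hash slots: %d\n",
	       prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0));
	return 0;
}

The FUTEX2_NUMA and FUTEX2_MPOL flags mentioned in the message are separate opt-ins passed to the futex2 system calls and need no prctl() setup.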


56 files changed: +3462 -767 lines

arch/x86/include/asm/futex.h

Lines changed: 6 additions & 2 deletions
@@ -48,7 +48,9 @@ do { \
 static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
 						       u32 __user *uaddr)
 {
-	if (!user_access_begin(uaddr, sizeof(u32)))
+	if (can_do_masked_user_access())
+		uaddr = masked_user_access_begin(uaddr);
+	else if (!user_access_begin(uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	switch (op) {
@@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 {
 	int ret = 0;
 
-	if (!user_access_begin(uaddr, sizeof(u32)))
+	if (can_do_masked_user_access())
+		uaddr = masked_user_access_begin(uaddr);
+	else if (!user_access_begin(uaddr, sizeof(u32)))
 		return -EFAULT;
 	asm volatile("\n"
 		"1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"

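Both hunks replace the plain user_access_begin() call with the same pattern: prefer masked user access when the architecture supports it, and fall back to the explicit range check otherwise. As a rough illustration of that pattern on its own (read_user_u32() is a hypothetical helper, not part of this commit):

#include <linux/types.h>
#include <linux/uaccess.h>

static int read_user_u32(u32 __user *uaddr, u32 *val)
{
	if (can_do_masked_user_access())
		uaddr = masked_user_access_begin(uaddr);
	else if (!user_access_begin(uaddr, sizeof(u32)))
		return -EFAULT;

	unsafe_get_user(*val, uaddr, Efault);	/* jumps to Efault on fault */
	user_access_end();
	return 0;
Efault:
	user_access_end();
	return -EFAULT;
}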
include/linux/futex.h

Lines changed: 29 additions & 2 deletions
@@ -4,11 +4,11 @@
 
 #include <linux/sched.h>
 #include <linux/ktime.h>
+#include <linux/mm_types.h>
 
 #include <uapi/linux/futex.h>
 
 struct inode;
-struct mm_struct;
 struct task_struct;
 
 /*
@@ -34,6 +34,7 @@ union futex_key {
 		u64 i_seq;
 		unsigned long pgoff;
 		unsigned int offset;
+		/* unsigned int node; */
 	} shared;
 	struct {
 		union {
@@ -42,11 +43,13 @@ union futex_key {
 		};
 		unsigned long address;
 		unsigned int offset;
+		/* unsigned int node; */
 	} private;
 	struct {
 		u64 ptr;
 		unsigned long word;
 		unsigned int offset;
+		unsigned int node;	/* NOT hashed! */
 	} both;
 };
 
@@ -77,7 +80,20 @@ void futex_exec_release(struct task_struct *tsk);
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
-#else
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+int futex_hash_allocate_default(void);
+void futex_hash_free(struct mm_struct *mm);
+int futex_mm_init(struct mm_struct *mm);
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline int futex_hash_allocate_default(void) { return 0; }
+static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+#else /* !CONFIG_FUTEX */
 static inline void futex_init_task(struct task_struct *tsk) { }
 static inline void futex_exit_recursive(struct task_struct *tsk) { }
 static inline void futex_exit_release(struct task_struct *tsk) { }
@@ -88,6 +104,17 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
 {
 	return -EINVAL;
 }
+static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
+{
+	return -EINVAL;
+}
+static inline int futex_hash_allocate_default(void)
+{
+	return 0;
+}
+static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
+
 #endif
 
 #endif

include/linux/mm.h

Lines changed: 6 additions & 6 deletions
@@ -742,7 +742,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * we don't rely on for anything - the mm_lock_seq read against which we
 	 * need ordering is below.
 	 */
-	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
+	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
 		return false;
 
 	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@@ -759,7 +759,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * after it has been unlocked.
 	 * This pairs with RELEASE semantics in vma_end_write_all().
 	 */
-	if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
+	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
 		up_read(&vma->vm_lock->lock);
 		return false;
 	}
@@ -774,15 +774,15 @@ static inline void vma_end_read(struct vm_area_struct *vma)
 }
 
 /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
 {
 	mmap_assert_write_locked(vma->vm_mm);
 
 	/*
 	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 	 * mm->mm_lock_seq can't be concurrently modified.
 	 */
-	*mm_lock_seq = vma->vm_mm->mm_lock_seq;
+	*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
 	return (vma->vm_lock_seq == *mm_lock_seq);
 }
 
@@ -793,7 +793,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
  */
 static inline void vma_start_write(struct vm_area_struct *vma)
 {
-	int mm_lock_seq;
+	unsigned int mm_lock_seq;
 
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
@@ -811,7 +811,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
-	int mm_lock_seq;
+	unsigned int mm_lock_seq;
 
 	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }

include/linux/mm_types.h

Lines changed: 31 additions & 7 deletions
@@ -32,6 +32,7 @@
 #define INIT_PASID	0
 
 struct address_space;
+struct futex_private_hash;
 struct mem_cgroup;
 
 /*
@@ -617,6 +618,9 @@ struct vma_numab_state {
  * per VM-area/task. A VM area is any part of the process virtual memory
  * space that has a special rule for the page-fault handlers (ie a shared
  * library, the executable area etc).
+ *
+ * Only explicitly marked struct members may be accessed by RCU readers before
+ * getting a stable reference.
  */
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
@@ -632,7 +636,11 @@ struct vm_area_struct {
 #endif
 	};
 
-	struct mm_struct *vm_mm;	/* The address space we belong to. */
+	/*
+	 * The address space we belong to.
+	 * Unstable RCU readers are allowed to read this.
+	 */
+	struct mm_struct *vm_mm;
 	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
 
 	/*
@@ -645,6 +653,12 @@ struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
+	/*
+	 * Flag to indicate areas detached from the mm->mm_mt tree.
+	 * Unstable RCU readers are allowed to read this.
+	 */
+	bool detached;
+
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 * - mmap_lock (in write mode)
@@ -659,11 +673,9 @@ struct vm_area_struct {
 	 * counter reuse can only lead to occasional unnecessary use of the
 	 * slowpath.
 	 */
-	int vm_lock_seq;
+	unsigned int vm_lock_seq;
+	/* Unstable RCU readers are allowed to read this. */
 	struct vma_lock *vm_lock;
-
-	/* Flag to indicate areas detached from the mm->mm_mt tree */
-	bool detached;
 #endif
 
 	/*
@@ -808,6 +820,9 @@ struct mm_struct {
		 * Roughly speaking, incrementing the sequence number is
		 * equivalent to releasing locks on VMAs; reading the sequence
		 * number can be part of taking a read lock on a VMA.
+		 * Incremented every time mmap_lock is write-locked/unlocked.
+		 * Initialized to 0, therefore odd values indicate mmap_lock
+		 * is write-locked and even values that it's released.
		 *
		 * Can be modified under write mmap_lock using RELEASE
		 * semantics.
@@ -816,9 +831,18 @@ struct mm_struct {
		 * Can be read with ACQUIRE semantics if not holding write
		 * mmap_lock.
		 */
-		int mm_lock_seq;
+		seqcount_t mm_lock_seq;
+#endif
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+		struct mutex			futex_hash_lock;
+		struct futex_private_hash	__rcu *futex_phash;
+		struct futex_private_hash	*futex_phash_new;
+		/* futex-ref */
+		unsigned long			futex_batches;
+		struct rcu_head			futex_rcu;
+		atomic_long_t			futex_atomic;
+		unsigned int			__percpu *futex_ref;
 #endif
-
 
		unsigned long hiwater_rss; /* High-watermark of RSS usage */
		unsigned long hiwater_vm;  /* High-water virtual memory usage */

include/linux/mmap_lock.h

Lines changed: 69 additions & 19 deletions
@@ -7,6 +7,7 @@
 #include <linux/rwsem.h>
 #include <linux/tracepoint-defs.h>
 #include <linux/types.h>
+#include <linux/cleanup.h>
 
 #define MMAP_LOCK_INITIALIZER(name) \
 	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -73,46 +74,76 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 }
 
 #ifdef CONFIG_PER_VMA_LOCK
-/*
- * Drop all currently-held per-VMA locks.
- * This is called from the mmap_lock implementation directly before releasing
- * a write-locked mmap_lock (or downgrading it to read-locked).
- * This should normally NOT be called manually from other places.
- * If you want to call this manually anyway, keep in mind that this will release
- * *all* VMA write locks, including ones from further up the stack.
- */
-static inline void vma_end_write_all(struct mm_struct *mm)
+
+static inline void mm_lock_seqcount_init(struct mm_struct *mm)
+{
+	seqcount_init(&mm->mm_lock_seq);
+}
+
+static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
+{
+	do_raw_write_seqcount_begin(&mm->mm_lock_seq);
+}
+
+static inline void mm_lock_seqcount_end(struct mm_struct *mm)
+{
+	ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
+	do_raw_write_seqcount_end(&mm->mm_lock_seq);
+}
+
+static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
 {
-	mmap_assert_write_locked(mm);
 	/*
-	 * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
-	 * mmap_lock being held.
-	 * We need RELEASE semantics here to ensure that preceding stores into
-	 * the VMA take effect before we unlock it with this store.
-	 * Pairs with ACQUIRE semantics in vma_start_read().
+	 * Since mmap_lock is a sleeping lock, and waiting for it to become
+	 * unlocked is more or less equivalent with taking it ourselves, don't
	 * bother with the speculative path if mmap_lock is already write-locked
+	 * and take the slow path, which takes the lock.
 	 */
-	smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
+	return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
+}
+
+static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
+{
+	return read_seqcount_retry(&mm->mm_lock_seq, seq);
 }
-#else
-static inline void vma_end_write_all(struct mm_struct *mm) {}
-#endif
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
+
+static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
+{
+	return false;
+}
+
+static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
+{
+	return true;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
 
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
+	mm_lock_seqcount_init(mm);
 }
 
 static inline void mmap_write_lock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_start_locking(mm, true);
 	down_write(&mm->mmap_lock);
+	mm_lock_seqcount_begin(mm);
 	__mmap_lock_trace_acquire_returned(mm, true, true);
 }
 
 static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
 {
 	__mmap_lock_trace_start_locking(mm, true);
 	down_write_nested(&mm->mmap_lock, subclass);
+	mm_lock_seqcount_begin(mm);
 	__mmap_lock_trace_acquire_returned(mm, true, true);
 }
 
@@ -122,10 +153,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
 
 	__mmap_lock_trace_start_locking(mm, true);
 	ret = down_write_killable(&mm->mmap_lock);
+	if (!ret)
+		mm_lock_seqcount_begin(mm);
 	__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
 	return ret;
 }
 
+/*
+ * Drop all currently-held per-VMA locks.
+ * This is called from the mmap_lock implementation directly before releasing
+ * a write-locked mmap_lock (or downgrading it to read-locked).
+ * This should normally NOT be called manually from other places.
+ * If you want to call this manually anyway, keep in mind that this will release
+ * *all* VMA write locks, including ones from further up the stack.
+ */
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	mm_lock_seqcount_end(mm);
+}
+
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
@@ -173,6 +220,9 @@ static inline void mmap_read_unlock(struct mm_struct *mm)
 	up_read(&mm->mmap_lock);
 }
 
+DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
+	     mmap_read_lock(_T), mmap_read_unlock(_T))
+
 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, false);
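
The seqcount conversion above is what enables speculative readers: sample mm_lock_seq, do the reads, then check that no mmap_lock writer ran in between. A hedged sketch of that intended usage, with a hypothetical helper that is not part of this commit:

static bool peek_hiwater_speculative(struct mm_struct *mm, unsigned long *out)
{
	unsigned int seq;

	/* Fails if mmap_lock is already write-locked; the caller would then
	 * take the lock and read normally. */
	if (!mmap_lock_speculate_try_begin(mm, &seq))
		return false;

	*out = READ_ONCE(mm->hiwater_rss);	/* illustrative speculative read */

	/* Result is only valid if no writer touched mm_lock_seq meanwhile. */
	return !mmap_lock_speculate_retry(mm, seq);
}

The DEFINE_GUARD() added in the last hunk similarly lets callers write scoped_guard(mmap_read_lock, mm) { ... } and have mmap_read_unlock() run automatically on every exit path.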
