From 4a6780a30e86cde7756954981db9e6aec285793d Mon Sep 17 00:00:00 2001 From: Haorui He Date: Sun, 12 Jan 2025 22:49:20 +0800 Subject: [PATCH 001/934] cgroup: update comment about dropping cgroup kn refs the cgroup is actually freed in css_free_rwork_fn() now the ref count of the cgroup's kernfs_node is also dropped there so we need to update the corresponding comment in cgroup_mkdir() Signed-off-by: Haorui He Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436b..805764cf14e2f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5835,7 +5835,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); -- GitLab From dae68fba8e115fd84d820354f79da1481135acbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 20 Jan 2025 15:57:49 +0100 Subject: [PATCH 002/934] cgroup/cpuset: Move procfs cpuset attribute under cgroup-v1.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpuset file is a legacy attribute that is bound primarily to cpuset v1 hierarchy (equivalent information is available in /proc/$pid/cgroup path on the unified hierarchy in conjunction with respective cgroup.controllers showing where cpuset controller is enabled). Followup to commit b0ced9d378d49 ("cgroup/cpuset: move v1 interfaces to cpuset-v1.c") and hide CONFIG_PROC_PID_CPUSET under CONFIG_CPUSETS_V1. Drop an obsolete comment too. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- init/Kconfig | 5 +++-- kernel/cgroup/cpuset-v1.c | 41 +++++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 45 --------------------------------------- 3 files changed, 44 insertions(+), 47 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index a20e6efd3f0fb..2f3121c49ed23 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1182,7 +1182,8 @@ config CPUSETS_V1 help Legacy cgroup v1 cpusets controller which has been deprecated by cgroup v2 implementation. The v1 is there for legacy applications - which haven't migrated to the new cgroup v2 interface yet. If you + which haven't migrated to the new cgroup v2 interface yet. Legacy + interface includes cpuset filesystem and /proc//cpuset. If you do not have any such application then you are completely fine leaving this option disabled. @@ -1190,7 +1191,7 @@ config CPUSETS_V1 config PROC_PID_CPUSET bool "Include legacy /proc//cpuset file" - depends on CPUSETS + depends on CPUSETS_V1 default y config CGROUP_DEVICE diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 25c1d7b77e2f2..81b5e2a50d587 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include "cgroup-internal.h" #include "cpuset-internal.h" /* @@ -373,6 +374,46 @@ out: return ret; } +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc//cpuset. + */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f910c828973a..5a637292faa20 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,7 +21,6 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ -#include "cgroup-internal.h" #include "cpuset-internal.h" #include @@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc//cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - rcu_read_lock(); - spin_lock_irq(&css_set_lock); - css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - spin_unlock_irq(&css_set_lock); - rcu_read_unlock(); - - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc//status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { -- GitLab From 337d1b354a297155579dea970fff9dd10ed32f77 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 27 Jan 2025 23:27:21 +0100 Subject: [PATCH 003/934] sched_ext: Move built-in idle CPU selection policy to a separate file As ext.c is becoming quite large, move the idle CPU selection policy to separate files (ext_idle.c / ext_idle.h) for better code readability. Moreover, group together all the idle CPU selection kfunc's to the same btf_kfunc_id_set block. No functional changes, this is purely code reorganization. Suggested-by: Yury Norov Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- MAINTAINERS | 3 +- kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 739 +---------------------------------- kernel/sched/ext_idle.c | 752 ++++++++++++++++++++++++++++++++++++ kernel/sched/ext_idle.h | 39 ++ 5 files changed, 808 insertions(+), 726 deletions(-) create mode 100644 kernel/sched/ext_idle.c create mode 100644 kernel/sched/ext_idle.h diff --git a/MAINTAINERS b/MAINTAINERS index 4e3a9c3fcb8cf..021ef04afa00b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21006,8 +21006,7 @@ S: Maintained W: https://github.com/sched-ext/scx T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git F: include/linux/sched/ext.h -F: kernel/sched/ext.h -F: kernel/sched/ext.c +F: kernel/sched/ext* F: tools/sched_ext/ F: tools/testing/selftests/sched_ext diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index fae1f5c921eb3..72d97aa8b7260 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -61,6 +61,7 @@ #ifdef CONFIG_SCHED_CLASS_EXT # include "ext.c" +# include "ext_idle.c" #endif #include "syscalls.c" diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a6d6d6dadde51..5f6a425d4ffea 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,6 +6,9 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ +#include +#include "ext_idle.h" + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { @@ -883,12 +886,6 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); - -#ifdef CONFIG_SMP -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -#endif static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -923,21 +920,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* idle tracking */ -#ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - -static struct { - cpumask_var_t cpu; - cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; - -#endif /* CONFIG_SMP */ - /* for %SCX_KICK_WAIT */ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; @@ -3175,416 +3157,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #ifdef CONFIG_SMP -static bool test_and_clear_cpu_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - /* - * SMT mask should be cleared whether we can claim @cpu or not. The SMT - * cluster is not wholly idle either way. This also prevents - * scx_pick_idle_cpu() from getting caught in an infinite loop. - */ - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - /* - * If offline, @cpu is not its own sibling and - * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. - * - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to - * reduce memory writes, which may help alleviate cache - * coherence pressure. - */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); - } -#endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -{ - int cpu; - -retry: - if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) - goto found; - - if (flags & SCX_PICK_IDLE_CORE) - return -EBUSY; - } - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; - -found: - if (test_and_clear_cpu_idle(cpu)) - return cpu; - else - goto retry; -} - -/* - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC - * domain is not defined). - */ -static unsigned int llc_weight(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sd->span_weight; -} - -/* - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC - * domain is not defined). - */ -static struct cpumask *llc_span(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sched_domain_span(sd); -} - -/* - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the - * NUMA domain is not defined). - */ -static unsigned int numa_weight(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return 0; - sg = sd->groups; - if (!sg) - return 0; - - return sg->group_weight; -} - -/* - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA - * domain is not defined). - */ -static struct cpumask *numa_span(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return NULL; - sg = sd->groups; - if (!sg) - return NULL; - - return sched_group_span(sg); -} - -/* - * Return true if the LLC domains do not perfectly overlap with the NUMA - * domains, false otherwise. - */ -static bool llc_numa_mismatch(void) -{ - int cpu; - - /* - * We need to scan all online CPUs to verify whether their scheduling - * domains overlap. - * - * While it is rare to encounter architectures with asymmetric NUMA - * topologies, CPU hotplugging or virtualized environments can result - * in asymmetric configurations. - * - * For example: - * - * NUMA 0: - * - LLC 0: cpu0..cpu7 - * - LLC 1: cpu8..cpu15 [offline] - * - * NUMA 1: - * - LLC 0: cpu16..cpu23 - * - LLC 1: cpu24..cpu31 - * - * In this case, if we only check the first online CPU (cpu0), we might - * incorrectly assume that the LLC and NUMA domains are fully - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC - * domains). - */ - for_each_online_cpu(cpu) - if (llc_weight(cpu) != numa_weight(cpu)) - return true; - - return false; -} - -/* - * Initialize topology-aware scheduling. - * - * Detect if the system has multiple LLC or multiple NUMA domains and enable - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle - * selection policy. - * - * Assumption: the kernel's internal topology representation assumes that each - * CPU belongs to a single LLC domain, and that each LLC domain is entirely - * contained within a single NUMA node. - */ -static void update_selcpu_topology(void) -{ - bool enable_llc = false, enable_numa = false; - unsigned int nr_cpus; - s32 cpu = cpumask_first(cpu_online_mask); - - /* - * Enable LLC domain optimization only when there are multiple LLC - * domains among the online CPUs. If all online CPUs are part of a - * single LLC domain, the idle CPU selection logic can choose any - * online CPU without bias. - * - * Note that it is sufficient to check the LLC domain of the first - * online CPU to determine whether a single LLC domain includes all - * CPUs. - */ - rcu_read_lock(); - nr_cpus = llc_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus()) - enable_llc = true; - pr_debug("sched_ext: LLC=%*pb weight=%u\n", - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); - } - - /* - * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps - * with the LLC domains. - * - * If all CPUs belong to the same NUMA node and the same LLC domain, - * enabling both NUMA and LLC optimizations is unnecessary, as checking - * for an idle CPU in the same domain twice is redundant. - */ - nr_cpus = numa_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) - enable_numa = true; - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); - } - rcu_read_unlock(); - - pr_debug("sched_ext: LLC idle selection %s\n", - str_enabled_disabled(enable_llc)); - pr_debug("sched_ext: NUMA idle selection %s\n", - str_enabled_disabled(enable_numa)); - - if (enable_llc) - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); - if (enable_numa) - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); -} - -/* - * Built-in CPU idle selection policy: - * - * 1. Prioritize full-idle cores: - * - always prioritize CPUs from fully idle cores (both logical CPUs are - * idle) to avoid interference caused by SMT. - * - * 2. Reuse the same CPU: - * - prefer the last used CPU to take advantage of cached data (L1, L2) and - * branch prediction optimizations. - * - * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. - * - * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. - * - * 5. Pick any idle CPU usable by the task. - * - * Step 3 and 4 are performed only if the system has, respectively, multiple - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and - * scx_selcpu_topo_numa). - * - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because - * we never call ops.select_cpu() for them, see select_task_rq(). - */ -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *found) -{ - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; - s32 cpu; - - *found = false; - - /* - * This is necessary to protect llc_cpus. - */ - rcu_read_lock(); - - /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. - * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. - */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); - - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); - } - - /* - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. - */ - if (wake_flags & SCX_WAKE_SYNC) { - cpu = smp_processor_id(); - - /* - * If the waker's CPU is cache affine and prev_cpu is idle, - * then avoid a migration. - */ - if (cpus_share_cache(cpu, prev_cpu) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * If the waker's local DSQ is empty, and the system is under - * utilized, try to wake up @p to the local DSQ of the waker. - * - * Checking only for an empty local DSQ is insufficient as it - * could give the wakee an unfair advantage when the system is - * oversaturated. - * - * Checking only for the presence of idle CPUs is also - * insufficient as the local DSQ of the waker could have tasks - * piled up on it even if there is an idle core elsewhere on - * the system. - */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; - } - } - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. - */ - if (sched_smt_active()) { - /* - * Keep using @prev_cpu if it's part of a fully idle core. - */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any fully idle core in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any fully idle core in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any full idle core usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Use @prev_cpu if it's idle. - */ - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any idle CPU in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - goto cpu_found; - - rcu_read_unlock(); - return prev_cpu; - -cpu_found: - rcu_read_unlock(); - - *found = true; - return cpu; -} - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { /* @@ -3651,90 +3223,6 @@ static void set_cpus_allowed_scx(struct task_struct *p, (struct cpumask *)p->cpus_ptr); } -static void reset_idle_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); -} - -static void update_builtin_idle(int cpu, bool idle) -{ - assign_cpu(cpu, idle_masks.cpu, idle); - -#ifdef CONFIG_SCHED_SMT - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - if (idle) { - /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. - */ - if (!cpumask_subset(smt, idle_masks.cpu)) - return; - cpumask_or(idle_masks.smt, idle_masks.smt, smt); - } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - } - } -#endif -} - -/* - * Update the idle state of a CPU to @idle. - * - * If @do_notify is true, ops.update_idle() is invoked to notify the scx - * scheduler of an actual idle state transition (idle to busy or vice - * versa). If @do_notify is false, only the idle state in the idle masks is - * refreshed without invoking ops.update_idle(). - * - * This distinction is necessary, because an idle CPU can be "reserved" and - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as - * busy even if no tasks are dispatched. In this case, the CPU may return - * to idle without a true state transition. Refreshing the idle masks - * without invoking ops.update_idle() ensures accurate idle state tracking - * while avoiding unnecessary updates and maintaining balanced state - * transitions. - */ -void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) -{ - int cpu = cpu_of(rq); - - lockdep_assert_rq_held(rq); - - /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* - * Update the idle masks: - * - for real idle transitions (do_notify == true) - * - for idle-to-idle transitions (indicated by the previous task - * being the idle thread, managed by pick_task_idle()) - * - * Skip updating idle masks if the previous task is not the idle - * thread, since set_next_task_idle() has already handled it when - * transitioning from a task to the idle thread (calling this - * function with do_notify == true). - * - * In this way we can avoid updating the idle masks twice, - * unnecessarily. - */ - if (static_branch_likely(&scx_builtin_idle_enabled)) - if (do_notify || is_idle_task(rq->curr)) - update_builtin_idle(cpu, idle); -} - static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); @@ -3742,7 +3230,7 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); if (scx_enabled()) - update_selcpu_topology(); + scx_idle_update_selcpu_topology(); if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); @@ -3774,12 +3262,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#else /* CONFIG_SMP */ - -static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -static void reset_idle_masks(void) {} - #endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) @@ -5615,9 +5097,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); -#ifdef CONFIG_SMP - update_selcpu_topology(); -#endif + scx_idle_update_selcpu_topology(); + cpus_read_unlock(); ret = validate_ops(ops); @@ -5665,7 +5146,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); + scx_idle_reset_masks(); static_branch_enable(&scx_builtin_idle_enabled); } else { static_branch_disable(&scx_builtin_idle_enabled); @@ -6308,10 +5789,8 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -#ifdef CONFIG_SMP - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -#endif + scx_idle_init_masks(); + scx_kick_cpus_pnt_seqs = __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, __alignof__(scx_kick_cpus_pnt_seqs[0])); @@ -6344,62 +5823,6 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ -#include - -__bpf_kfunc_start_defs(); - -static bool check_builtin_idle_enabled(void) -{ - if (static_branch_likely(&scx_builtin_idle_enabled)) - return true; - - scx_ops_error("built-in idle tracking is disabled"); - return false; -} - -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!check_builtin_idle_enabled()) - goto prev_cpu; - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; - -#ifdef CONFIG_SMP - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#endif - -prev_cpu: - *is_idle = false; - return prev_cpu; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -7458,142 +6881,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) */ } -/** - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking - * per-CPU cpumask. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, - * per-physical-core cpumask. Can be used to determine if an entire physical - * core is free. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - if (sched_smt_active()) - return idle_masks.smt; - else - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to - * either the percpu, or SMT idle-tracking cpumask. - * @idle_mask: &cpumask to use - */ -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -{ - /* - * Empty function body because we aren't actually acquiring or releasing - * a reference to a global idle cpumask, which is read-only in the - * caller and is never released. The acquire / release semantics here - * are just used to make the cpumask a trusted pointer in the caller. - */ -} - -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!check_builtin_idle_enabled()) - return false; - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!check_builtin_idle_enabled()) - return -EBUSY; - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - /** * scx_bpf_task_running - Is task currently running? * @p: task of interest @@ -7769,8 +7056,6 @@ static int __init scx_init(void) * check using scx_kf_allowed(). */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_select_cpu)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || @@ -7790,6 +7075,12 @@ static int __init scx_init(void) return ret; } + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 0000000000000..cb981956005b4 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + * Copyright (c) 2024 Andrea Righi + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +#ifdef CONFIG_SMP +#ifdef CONFIG_CPUMASK_OFFSTACK +#define CL_ALIGNED_IF_ONSTACK +#else +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp +#endif + +/* Enable/disable LLC aware optimizations */ +DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +static struct { + cpumask_var_t cpu; + cpumask_var_t smt; +} idle_masks CL_ALIGNED_IF_ONSTACK; + +bool scx_idle_test_and_clear_cpu(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from idle_masks.smt. Ensure that @cpu + * is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + else if (cpumask_test_cpu(cpu, idle_masks.smt)) + __cpumask_clear_cpu(cpu, idle_masks.smt); + } +#endif + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); +} + +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(void) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + */ + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * 5. Pick any idle CPU usable by the task. + * + * Step 3 and 4 are performed only if the system has, respectively, multiple + * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa). + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) +{ + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; + s32 cpu; + + *found = false; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = numa_span(prev_cpu); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + cpu = smp_processor_id(); + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + if (cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + if (!cpumask_empty(idle_masks.cpu) && + !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any full idle core usable by the task. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Use @prev_cpu if it's idle. + */ + if (scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU usable by the task. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto cpu_found; + + rcu_read_unlock(); + return prev_cpu; + +cpu_found: + rcu_read_unlock(); + + *found = true; + return cpu; +} + +void scx_idle_reset_masks(void) +{ + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + cpumask_copy(idle_masks.cpu, cpu_online_mask); + cpumask_copy(idle_masks.smt, cpu_online_mask); +} + +void scx_idle_init_masks(void) +{ + BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); +} + +static void update_builtin_idle(int cpu, bool idle) +{ + assign_cpu(cpu, idle_masks.cpu, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + if (idle) { + /* + * idle_masks.smt handling is racy but that's fine as + * it's only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_masks.cpu)) + return; + cpumask_or(idle_masks.smt, idle_masks.smt, smt); + } else { + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} +#endif /* CONFIG_SMP */ + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ + if (!check_builtin_idle_enabled()) + goto prev_cpu; + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + +#ifdef CONFIG_SMP + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); +#endif + +prev_cpu: + *is_idle = false; + return prev_cpu; +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_masks.smt; + else + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (ops_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, flags); +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 0000000000000..7a13a74815ba7 --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + * Copyright (c) 2024 Andrea Righi + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +extern struct static_key_false scx_builtin_idle_enabled; + +#ifdef CONFIG_SMP +extern struct static_key_false scx_selcpu_topo_llc; +extern struct static_key_false scx_selcpu_topo_numa; + +void scx_idle_update_selcpu_topology(void); +void scx_idle_reset_masks(void); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(void) {} +static inline void scx_idle_reset_masks(void) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found); + +extern int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ -- GitLab From ad6c08d8c1045843ec564a73981ece6ec31d11a0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 23:52:14 +0000 Subject: [PATCH 004/934] cgroup/misc: Remove unused misc_cg_res_total_usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit misc_cg_res_total_usage() was added in 2021 by commit a72232eabdfc ("cgroup: Add misc cgroup controller") but has remained unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 6 ------ kernel/cgroup/misc.c | 16 ---------------- 2 files changed, 22 deletions(-) diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e597..4bf261d41a6d2 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -60,7 +60,6 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -u64 misc_cg_res_total_usage(enum misc_res_type type); int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); @@ -104,11 +103,6 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - return 0; -} - static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6c..2fa3a4fb2aaf0 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -67,22 +67,6 @@ static inline bool valid_type(enum misc_res_type type) return type >= 0 && type < MISC_CG_RES_TYPES; } -/** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic64_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - /** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. -- GitLab From 17103b8504de68958b0ff412ed2ae2e6484fa65f Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 31 Jan 2025 16:09:28 +0900 Subject: [PATCH 005/934] sched_ext: Implement event counter infrastructure Collect the statistics of specific types of behavior in the sched_ext core, which are not easily visible but still interesting to an scx scheduler. An event type is defined in 'struct scx_event_stats.' When an event occurs, its counter is accumulated using 'scx_add_event()' and '__scx_add_event()' to per-CPU 'struct scx_event_stats' for efficiency. 'scx_bpf_events()' aggregates all the per-CPU counters and exposes a system-wide counters. For convenience and readability of the code, 'scx_agg_event()' and 'scx_dump_event()' are provided. The collected events can be observed after a BPF scheduler is unloaded beforea new BPF scheduler is loaded so the per-CPU 'struct scx_event_stats' are reset. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 103 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5f6a425d4ffea..4e28e88e88d48 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1440,6 +1440,64 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { +}; + +/* + * The event counter is organized by a per-CPU variable to minimize the + * accounting overhead without synchronization. A system-wide view on the + * event counter is constructed when requested by scx_bpf_get_event_stat(). + */ +static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); + +/** + * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This can be used when preemption is not disabled. + */ +#define scx_add_event(name, cnt) do { \ + this_cpu_add(event_stats_cpu.name, cnt); \ +} while(0) + +/** + * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This should be used only when preemption is disabled. + */ +#define __scx_add_event(name, cnt) do { \ + __this_cpu_add(event_stats_cpu.name, cnt); \ +} while(0) + +/** + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' + * @dst_e: destination event stats + * @src_e: source event stats + * @kind: a kind of event to be aggregated + */ +#define scx_agg_event(dst_e, src_e, kind) do { \ + (dst_e)->kind += READ_ONCE((src_e)->kind); \ +} while(0) + +/** + * scx_dump_event - Dump an event 'kind' in 'events' to 's' + * @s: output seq_buf + * @events: event stats + * @kind: a kind of event to dump + */ +#define scx_dump_event(s, events, kind) do { \ + dump_line(&(s), "%30s: %16llu", #kind, (events)->kind); \ +} while (0) + + +static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); + static enum scx_ops_enable_state scx_ops_enable_state(void) { return atomic_read(&scx_ops_enable_state_var); @@ -4785,6 +4843,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) .at_jiffies = jiffies, }; struct seq_buf s; + struct scx_event_stats events; unsigned long flags; char *buf; int cpu; @@ -4893,6 +4952,12 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq_unlock(rq, &rf); } + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_bpf_events(&events, sizeof(events)); + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); @@ -5000,6 +5065,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_ops_enable_mutex); + /* + * Clear event counters so a new scx scheduler gets + * fresh event counter values. + */ + for_each_possible_cpu(cpu) { + struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); + memset(e, 0, sizeof(*e)); + } + if (!scx_ops_helper) { WRITE_ONCE(scx_ops_helper, scx_create_rt_helper("sched_ext_ops_helper")); @@ -7001,6 +7075,34 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +/* + * scx_bpf_events - Get a system-wide event counter to + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz'' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_event_stats e_sys, *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into the system-wide counters. */ + memset(&e_sys, 0, sizeof(e_sys)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + } + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. + */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7033,6 +7135,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { -- GitLab From f7f6142107f0e0dd4a2b041116461a049ca18cb0 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 31 Jan 2025 16:09:29 +0900 Subject: [PATCH 006/934] sched_ext: Add an event, SCX_EV_SELECT_CPU_FALLBACK Add a core event, SCX_EV_SELECT_CPU_FALLBACK, which represents how many times ops.select_cpu() returns a CPU that the task can't use. __scx_add_event() is used since the caller holds an rq lock, so the preemption has already been disabled. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1d70a9867fb12..f7545430a5482 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -146,6 +146,7 @@ struct sched_ext_entity { u32 weight; s32 sticky_cpu; s32 holding_cpu; + s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic_long_t ops_state; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4e28e88e88d48..f80c8dc024a7a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1444,6 +1444,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * Collection of event counters. Event types are placed in descending order. */ struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + u64 SCX_EV_SELECT_CPU_FALLBACK; }; /* @@ -2170,6 +2175,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; + + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && + unlikely(cpu_of(rq) != p->scx.selected_cpu)) + __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); } static void ops_dequeue(struct task_struct *p, u64 deq_flags) @@ -3240,6 +3249,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, select_cpu, p, prev_cpu, wake_flags); + p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) return cpu; @@ -3250,6 +3260,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag s32 cpu; cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); + p->scx.selected_cpu = cpu; if (found) { p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; @@ -4957,6 +4968,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) dump_line(&s, "--------------"); scx_bpf_events(&events, sizeof(events)); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), @@ -7090,6 +7102,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, memset(&e_sys, 0, sizeof(e_sys)); for_each_possible_cpu(cpu) { e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); } /* -- GitLab From 9be0a1b0c8fbd21ef6d7f9428a76b759f9db0de3 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 31 Jan 2025 16:09:30 +0900 Subject: [PATCH 007/934] sched_ext: Add an event, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE Add a core event, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, which represents how many times a BPF scheduler tries to dispatch to an offlined local DSQ. __scx_add_event() is used since the caller holds an rq lock, so the preemption has already been disabled. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f80c8dc024a7a..d1447f18381ee 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1449,6 +1449,12 @@ struct scx_event_stats { * the core scheduler code silently picks a fallback CPU. */ u64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + u64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; }; /* @@ -2643,6 +2649,7 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return; } @@ -4969,6 +4976,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_bpf_events(&events, sizeof(events)); scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), @@ -7103,6 +7111,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, for_each_possible_cpu(cpu) { e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); } /* -- GitLab From 7125660bc16b7e87665bc859e7337388bdb63e0a Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 31 Jan 2025 16:09:31 +0900 Subject: [PATCH 008/934] sched_ext: Add an event, SCX_EV_DISPATCH_KEEP_LAST Add a core event, SCX_EV_DISPATCH_KEEP_LAST, which represents how many times a task is continued to run without ops.enqueue() when SCX_OPS_ENQ_LAST is not set. __scx_add_event() is used since the caller holds an rq lock, so the preemption has already been disabled. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d1447f18381ee..cbb3d059072fb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1455,6 +1455,12 @@ struct scx_event_stats { * the meantime. In this case, the task is bounced to the global DSQ. */ u64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + u64 SCX_EV_DISPATCH_KEEP_LAST; }; /* @@ -2907,6 +2913,7 @@ no_tasks: if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -4977,6 +4984,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_bpf_events(&events, sizeof(events)); scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), @@ -7112,6 +7120,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); } /* -- GitLab From 003c0414318a1829a1a5b195ad81e8a7960c3f5d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:30 +0000 Subject: [PATCH 009/934] perf/amd/ibs: Remove IBS_{FETCH|OP}_CONFIG_MASK macros Definition of these macros are very simple and they are used at only one place. Get rid of unnecessary redirection. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-2-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e7a8b8758e088..4ca8006d22210 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,9 +28,6 @@ static u32 ibs_caps; #include #include -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - /* attr.config2 */ #define IBS_SW_FILTER_MASK 1 @@ -688,7 +685,7 @@ static struct perf_ibs perf_ibs_fetch = { .read = perf_ibs_read, }, .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, + .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, @@ -711,7 +708,7 @@ static struct perf_ibs perf_ibs_op = { .read = perf_ibs_read, }, .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, + .config_mask = IBS_OP_MAX_CNT, .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, -- GitLab From 88c7bcad71c83f52f24108dedcecae0d18dbc627 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:31 +0000 Subject: [PATCH 010/934] perf/amd/ibs: Remove pointless sample period check Valid perf event sample period value for IBS PMUs (Fetch and Op both) is limited to multiple of 0x10. perf_ibs_init() has this check: if (!event->attr.sample_freq && hwc->sample_period & 0x0f) return -EINVAL; But it's broken since hwc->sample_period will always be 0 when event->attr.sample_freq is 0 (irrespective of event->attr.freq value.) One option to fix this is to change the condition: - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + if (!event->attr.freq && hwc->sample_period & 0x0f) However, that will break all userspace tools which have been using IBS event with sample_period not multiple of 0x10. Another option is to remove the condition altogether and mask lower nibble _silently_, same as what current code is inadvertently doing. I'm preferring this approach as it keeps the existing behavior. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-3-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4ca8006d22210..bd8919e7c3b17 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -307,13 +307,8 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; + + /* Silently mask off lower nibble. IBS hw mandates it. */ hwc->sample_period &= ~0x0FULL; if (!hwc->sample_period) hwc->sample_period = 0x10; -- GitLab From 598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:32 +0000 Subject: [PATCH 011/934] perf/amd/ibs: Fix ->config to sample period calculation for OP PMU Instead of using standard perf_event_attr->freq=0 and ->sample_period fields, IBS event in 'sample period mode' can also be opened by setting period value directly in perf_event_attr->config in a MaxCnt bit-field format. IBS OP MaxCnt bits are defined as: (high bits) IbsOpCtl[26:20] = IbsOpMaxCnt[26:20] (low bits) IbsOpCtl[15:0] = IbsOpMaxCnt[19:4] Perf event sample period can be derived from MaxCnt bits as: sample_period = (high bits) | ((low_bits) << 4); However, current code just masks MaxCnt bits and shifts all of them, including high bits, which is incorrect. Fix it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index bd8919e7c3b17..f95542b75b916 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -271,7 +271,7 @@ static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; - u64 max_cnt, config; + u64 config; int ret; perf_ibs = get_ibs_pmu(event->attr.type); @@ -313,10 +313,19 @@ static int perf_ibs_init(struct perf_event *event) if (!hwc->sample_period) hwc->sample_period = 0x10; } else { - max_cnt = config & perf_ibs->cnt_mask; + u64 period = 0; + + if (perf_ibs == &perf_ibs_op) { + period = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + period |= config & IBS_OP_MAX_CNT_EXT_MASK; + } else { + period = (config & IBS_FETCH_MAX_CNT) << 4; + } + config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; + event->attr.sample_period = period; + hwc->sample_period = period; } if (!hwc->sample_period) -- GitLab From 46dcf85566170d4528b842bf83ffc350d71771fa Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:33 +0000 Subject: [PATCH 012/934] perf/amd/ibs: Fix perf_ibs_op.cnt_mask for CurCnt IBS Op uses two counters: MaxCnt and CurCnt. MaxCnt is programmed with the desired sample period. IBS hw generates sample when CurCnt reaches to MaxCnt. The size of these counter used to be 20 bits but later they were extended to 27 bits. The 7 bit extension is indicated by CPUID Fn8000_001B_EAX[6 / OpCntExt]. perf_ibs->cnt_mask variable contains bit masks for MaxCnt and CurCnt. But IBS driver does not set upper 7 bits of CurCnt in cnt_mask even when OpCntExt CPUID bit is set. Fix this. IBS driver uses cnt_mask[CurCnt] bits only while disabling an event. Fortunately, CurCnt bits are not read from MSR while re-enabling the event, instead MaxCnt is programmed with desired period and CurCnt is set to 0. Hence, we did not see any issues so far. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-5-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 3 ++- arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index f95542b75b916..d9c84f1d530f1 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1245,7 +1245,8 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; - perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK | + IBS_OP_CUR_CNT_EXT_MASK); } if (ibs_caps & IBS_CAPS_ZEN4) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 1ac79f3616456..ee5581768fea7 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -514,6 +514,7 @@ struct pebs_xmm { */ #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) -- GitLab From e1e7844ced88f9558a48579390a7d4eaac6a28eb Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:34 +0000 Subject: [PATCH 013/934] perf/amd/ibs: Don't allow freq mode event creation through ->config interface Most perf_event_attr->config bits directly maps to IBS_{FETCH|OP}_CTL MSR. Since the sample period is programmed in these control registers, IBS PMU driver allows opening an IBS event by setting sample period value directly in perf_event_attr->config instead of using explicit perf_event_attr->sample_period interface. However, this logic is not applicable for freq mode events since the semantics of control register fields are applicable only to fixed sample period whereas the freq mode event adjusts sample period after each and every sample. Currently, IBS driver (unintentionally) allows creating freq mode event via ->config interface, which is semantically wrong as well as detrimental because it can be misused to bypass perf_event_max_sample_rate checks. Don't allow freq mode event creation through perf_event_attr->config interface. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-6-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index d9c84f1d530f1..3e7ca1e2f25e3 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -315,6 +315,9 @@ static int perf_ibs_init(struct perf_event *event) } else { u64 period = 0; + if (event->attr.freq) + return -EINVAL; + if (perf_ibs == &perf_ibs_op) { period = (config & IBS_OP_MAX_CNT) << 4; if (ibs_caps & IBS_CAPS_OPCNTEXT) -- GitLab From b2fc7b282bf7c1253b01c8da84e894539a3e709d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:35 +0000 Subject: [PATCH 014/934] perf/amd/ibs: Add PMU specific minimum period 0x10 is the minimum sample period for IBS Fetch and 0x90 for IBS Op. Current IBS PMU driver uses 0x10 for both the PMUs, which is incorrect. Fix it by adding PMU specific minimum period values in struct perf_ibs. Also, bail out opening a 'sample period mode' event if the user requested sample period is less than PMU supported minimum value. For a 'freq mode' event, start calibrating sample period from PMU specific minimum period. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-7-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 3e7ca1e2f25e3..7b54b76d39f5d 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -86,6 +86,7 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u16 min_period; u64 max_period; unsigned long offset_mask[1]; int offset_max; @@ -308,10 +309,14 @@ static int perf_ibs_init(struct perf_event *event) /* raw max_cnt may not be set */ return -EINVAL; - /* Silently mask off lower nibble. IBS hw mandates it. */ - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; + if (event->attr.freq) { + hwc->sample_period = perf_ibs->min_period; + } else { + /* Silently mask off lower nibble. IBS hw mandates it. */ + hwc->sample_period &= ~0x0FULL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } } else { u64 period = 0; @@ -329,10 +334,10 @@ static int perf_ibs_init(struct perf_event *event) config &= ~perf_ibs->cnt_mask; event->attr.sample_period = period; hwc->sample_period = period; - } - if (!hwc->sample_period) - return -EINVAL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } /* * If we modify hwc->sample_period, we also need to update @@ -353,7 +358,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, int overflow; /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, perf_ibs->min_period, + perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); return overflow; @@ -696,6 +702,7 @@ static struct perf_ibs perf_ibs_fetch = { .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .min_period = 0x10, .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, @@ -720,6 +727,7 @@ static struct perf_ibs perf_ibs_op = { IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .min_period = 0x90, .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, -- GitLab From 1afbdd970f50f2e0431fae26b25d4e54e561fa7f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:36 +0000 Subject: [PATCH 015/934] perf/amd/ibs: Add ->check_period() callback IBS Fetch and IBS Op PMUs have constraints on sample period. The sample period is verified at the time of opening an event but not at the ioctl() interface. Hence, a user can open an event with valid period but change it later with ioctl(). Add a ->check_period() callback to verify the period provided at ioctl() is also valid. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-8-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 7b54b76d39f5d..aea893a971b6e 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -564,6 +564,28 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +static int perf_ibs_check_period(struct perf_event *event, u64 value) +{ + struct perf_ibs *perf_ibs; + u64 low_nibble; + + if (event->attr.freq) + return 0; + + perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + low_nibble = value & 0xFULL; + + /* + * This contradicts with perf_ibs_init() which allows sample period + * with lower nibble bits set but silently masks them off. Whereas + * this returns error. + */ + if (low_nibble || value < perf_ibs->min_period) + return -EINVAL; + + return 0; +} + /* * We need to initialize with empty group if all attributes in the * group are dynamic. @@ -696,6 +718,7 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, @@ -720,6 +743,7 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_MAX_CNT, -- GitLab From fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:37 +0000 Subject: [PATCH 016/934] perf/amd/ibs: Ceil sample_period to min_period The sample_period needs to be recalibrated after every sample to match the desired sampling freq for a 'freq mode event'. Since the next sample_period is calculated by generic kernel, PMU specific constraints are not (explicitly) reckoned. The sample_period value is programmed in a MaxCnt field of IBS PMUs, and the MaxCnt field has following constraints: 1) MaxCnt must be multiple of 0x10. Kernel keeps track of residual / over-counted period into period_left, which should take care of this constraint by programming MaxCnt with (sample_period & ~0xF) and adding remaining period into the next sample. 2) MaxCnt must be >= 0x10 for IBS Fetch PMU and >= 0x90 for IBS Op PMU. Currently, IBS PMU driver allows sample_period below min_period, which is an undefined HW behavior. Reset sample_period to min_period whenever it's less than that. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250115054438.1021-9-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index aea893a971b6e..7978d7910adc3 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -457,6 +457,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + perf_ibs_set_period(perf_ibs, hwc, &period); if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { config |= period & IBS_OP_MAX_CNT_EXT_MASK; @@ -1191,6 +1194,10 @@ fail: perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); + + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + out: if (throttle) { perf_ibs_stop(event, 0); -- GitLab From eae8a56ae0c74c1cf2f92a6709d215a9f329f60c Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Fri, 24 Jan 2025 09:38:25 +0000 Subject: [PATCH 017/934] uprobes: Remove redundant spinlock in uprobe_deny_signal() Since clearing a bit in thread_info is an atomic operation, the spinlock is redundant and can be removed, reducing lock contention is good for performance. Signed-off-by: Liao Chang Signed-off-by: Peter Zijlstra (Intel) Acked-by: "Masami Hiramatsu (Google)" Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250124093826.2123675-2-liaochang1@huawei.com --- kernel/events/uprobes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ca797cbe465f..33bd6083f7c40 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2302,9 +2302,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { - spin_lock_irq(&t->sighand->siglock); clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; -- GitLab From 0de64754a55470adb0a870f3105c8922334bb6fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 21:10:42 +0100 Subject: [PATCH 018/934] tools/nolibc: add prototypes for non-static functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With -Wmissing-prototypes the compiler will warn about non-static functions which don't have a prototype defined. This warning doesn't make much sense for nolibc itself but for user code it is still useful. To pacify the compiler add prototypes next to the function definitions, similar to how it is handled elsewhere in the kernel. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-prototype-v1-1-e1afc5c1999a@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-mips.h | 1 + tools/include/nolibc/crt.h | 2 ++ tools/include/nolibc/signal.h | 1 + tools/include/nolibc/stackprotector.h | 2 ++ tools/include/nolibc/stdlib.h | 1 + tools/include/nolibc/string.h | 4 ++++ 6 files changed, 11 insertions(+) diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 1791a8ce58da6..753a8ed2cf695 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -179,6 +179,7 @@ }) /* startup code, note that it's called __start on MIPS */ +void __start(void); void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) { __asm__ volatile ( diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index bbcd5fd09806b..c4b10103bbec5 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -10,6 +10,7 @@ char **environ __attribute__((weak)); const unsigned long *_auxv __attribute__((weak)); +void _start(void); static void __stack_chk_init(void); static void exit(int); @@ -22,6 +23,7 @@ extern void (*const __init_array_end[])(int, char **, char**) __attribute__((wea extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); +void _start_c(long *sp); __attribute__((weak,used)) void _start_c(long *sp) { diff --git a/tools/include/nolibc/signal.h b/tools/include/nolibc/signal.h index 137552216e469..cdcc5904c51e9 100644 --- a/tools/include/nolibc/signal.h +++ b/tools/include/nolibc/signal.h @@ -13,6 +13,7 @@ #include "sys.h" /* This one is not marked static as it's needed by libgcc for divide by zero */ +int raise(int signal); __attribute__((weak,unused,section(".text.nolibc_raise"))) int raise(int signal) { diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index 1d0d5259ec417..c71a2c257177a 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -18,6 +18,7 @@ * triggering stack protector errors themselves */ +void __stack_chk_fail(void); __attribute__((weak,used,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail(void) { @@ -28,6 +29,7 @@ void __stack_chk_fail(void) for (;;); } +void __stack_chk_fail_local(void); __attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail_local(void) { diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 75aa273c23a61..86ad378ab1ea2 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -30,6 +30,7 @@ static __attribute__((unused)) char itoa_buffer[21]; */ /* must be exported, as it's used by libgcc for various divide functions */ +void abort(void); __attribute__((weak,unused,noreturn,section(".text.nolibc_abort"))) void abort(void) { diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 9ec9c24f38c09..ba84ab700e300 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -32,6 +32,7 @@ int memcmp(const void *s1, const void *s2, size_t n) /* might be ignored by the compiler without -ffreestanding, then found as * missing. */ +void *memmove(void *dst, const void *src, size_t len); __attribute__((weak,unused,section(".text.nolibc_memmove"))) void *memmove(void *dst, const void *src, size_t len) { @@ -56,6 +57,7 @@ void *memmove(void *dst, const void *src, size_t len) #ifndef NOLIBC_ARCH_HAS_MEMCPY /* must be exported, as it's used by libgcc on ARM */ +void *memcpy(void *dst, const void *src, size_t len); __attribute__((weak,unused,section(".text.nolibc_memcpy"))) void *memcpy(void *dst, const void *src, size_t len) { @@ -73,6 +75,7 @@ void *memcpy(void *dst, const void *src, size_t len) /* might be ignored by the compiler without -ffreestanding, then found as * missing. */ +void *memset(void *dst, int b, size_t len); __attribute__((weak,unused,section(".text.nolibc_memset"))) void *memset(void *dst, int b, size_t len) { @@ -124,6 +127,7 @@ char *strcpy(char *dst, const char *src) * thus itself, hence the asm() statement below that's meant to disable this * confusing practice. */ +size_t strlen(const char *str); __attribute__((weak,unused,section(".text.nolibc_strlen"))) size_t strlen(const char *str) { -- GitLab From 69ccba67d7cd9d92f313164eb00606b553a6d188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 21:10:43 +0100 Subject: [PATCH 019/934] selftests/nolibc: ignore -Wmissing-prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make sure nolibc itself is compatible with -Wmissing-prototypes the compiler flag should be enabled when building nolibc-test. However some of its functions are non-static to ease debugging [0], triggering the compiler warning. Disable the warning inside nolibc-test while still enabling it for nolibc itself. [0] https://lore.kernel.org/lkml/ZMjM0UPRAqoC+goY@1wt.eu/ Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-prototype-v1-2-e1afc5c1999a@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 0e0e3b48a8c3a..f162793b162f9 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -43,6 +43,8 @@ #endif #endif +#pragma GCC diagnostic ignored "-Wmissing-prototypes" + #include "nolibc-test-linkage.h" /* for the type of int_fast16_t and int_fast32_t, musl differs from glibc and nolibc */ -- GitLab From 4da4e35e9d7ebcf575e02791fa3b1784bd0765a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 21:10:44 +0100 Subject: [PATCH 020/934] selftests/nolibc: enable -Wmissing-prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User code may want to use this compiler flag. Make sure it is supported by nolibc. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-prototype-v1-3-e1afc5c1999a@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7d14a7c0cb626..f867ebe3474e0 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -164,7 +164,7 @@ CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ - $(call cc-option,-fno-stack-protector) \ + $(call cc-option,-fno-stack-protector) $(call cc-option,-Wmissing-prototypes) \ $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := -- GitLab From cfb1bfe9535a74f01a6b9df9fbf1cf0608d13786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 28 Jan 2025 22:22:46 +0100 Subject: [PATCH 021/934] tools/nolibc: make signature of ioctl() more flexible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POSIX defines the signature of ioctl() as follows, to allow passing a pointer or integer without casting: int ioctl(int fildes, int request, ... /* arg */); Nolibc ioctl() expects a pointer, forcing the user to manually cast. Using va_arg to make the signature more flexible would work but seems to prevent inlining of the function. Instead use a macro. "fd" and "req" will still be typechecked through sys_ioctl(). Acked-by: Willy Tarreau Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/sys.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index d4a5c2399a66b..8c0a55bc9dc3a 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -532,20 +532,16 @@ uid_t getuid(void) /* - * int ioctl(int fd, unsigned long req, void *value); + * int ioctl(int fd, unsigned long cmd, ... arg); */ static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) +long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - return my_syscall3(__NR_ioctl, fd, req, value); + return my_syscall3(__NR_ioctl, fd, cmd, arg); } -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - return __sysret(sys_ioctl(fd, req, value)); -} +#define ioctl(fd, cmd, arg) __sysret(sys_ioctl(fd, cmd, (unsigned long)(arg))) /* * int kill(pid_t pid, int signal); -- GitLab From e16214dc1fbfa7ad6b6d2d8b4dcf05a7b3e5eb45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 08:37:37 +0100 Subject: [PATCH 022/934] selftests/nolibc: drop mips32be EXTRACONFIG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kbuild already contains logic to merge predefines snippets into a defconfig file. For MIPS a snippet for big-endian is already provided. Link: https://lore.kernel.org/r/20250123-nolibc-config-v2-1-5701c35995d6@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index f867ebe3474e0..5e2abc3493ded 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -82,7 +82,7 @@ DEFCONFIG_x86 = defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm = multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig -DEFCONFIG_mips32be = malta_defconfig +DEFCONFIG_mips32be = malta_defconfig generic/eb.config DEFCONFIG_ppc = pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le = powernv_defconfig @@ -93,7 +93,6 @@ DEFCONFIG_s390 = defconfig DEFCONFIG_loongarch = defconfig DEFCONFIG = $(DEFCONFIG_$(XARCH)) -EXTRACONFIG_mips32be = -d CONFIG_CPU_LITTLE_ENDIAN -e CONFIG_CPU_BIG_ENDIAN EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) # optional tests to run (default = all) -- GitLab From a75b763b51ee259b5c6dd3725b5fe515548917fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 08:37:38 +0100 Subject: [PATCH 023/934] selftests/nolibc: drop call to prepare target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "prepare" target does not need to be run manually. kbuild knows when to use it on its own and the target is not even documented. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-config-v2-2-5701c35995d6@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 5e2abc3493ded..60e422fc6b07d 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -263,7 +263,7 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ -- GitLab From 25d5ef9e7c55fa6f997a53eced9a33a982c9421b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 08:37:39 +0100 Subject: [PATCH 024/934] selftests/nolibc: drop call to mrproper target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "mrproper" unnecessarily cleans a lot of files. kbuild is smart enough to handle changed configurations, so the cleanup is not necessary and only leads to excessive rebuilds. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-config-v2-3-5701c35995d6@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 60e422fc6b07d..4e075fda9e99d 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -263,7 +263,7 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ -- GitLab From d7d271ec30dd02bb9c695233e7fc04a6c728f467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 08:37:40 +0100 Subject: [PATCH 025/934] selftests/nolibc: execute defconfig before other targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some targets use the test kernel configuration. Executing defconfig in the same make invocation as those targets results in errors as the configuration may be in an inconsistent state during reconfiguration. Avoid this by introducing ordering dependencies between the defconfig and some other targets. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-config-v2-4-5701c35995d6@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 4e075fda9e99d..983985b7529b6 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -219,7 +219,7 @@ all: run sysroot: sysroot/$(ARCH)/include -sysroot/$(ARCH)/include: +sysroot/$(ARCH)/include: | defconfig $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot $(Q)$(MAKE) -C $(srctree) outputmakefile @@ -269,10 +269,10 @@ defconfig: $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ fi -kernel: +kernel: | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -kernel-standalone: initramfs +kernel-standalone: initramfs | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel -- GitLab From c1f4a7a84037249d086a4114c0c4332a260e9091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 08:37:41 +0100 Subject: [PATCH 026/934] selftests/nolibc: always keep test kernel configuration up to date MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid using a stale test kernel configuration by always synchronizing it to the current source tree. kbuild is smart enough to avoid spurious rebuilds. Shuffle the code around a bit to keep all the commands with side-effects together. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20250123-nolibc-config-v2-5-5701c35995d6@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 9c5160c538812..bc4e92b4f1b98 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -157,10 +157,6 @@ test_arch() { fi MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") - mkdir -p "$build_dir" - if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then - swallow_output "${MAKE[@]}" defconfig - fi case "$test_mode" in 'system') test_target=run @@ -173,6 +169,9 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" + + mkdir -p "$build_dir" + swallow_output "${MAKE[@]}" defconfig swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed -- GitLab From b944249bcea97f2f6229852ae3f05f7acdcb0681 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 17:57:59 +0100 Subject: [PATCH 027/934] fsnotify: add mount notification infrastructure This is just the plumbing between the event source (fs/namespace.c) and the event consumer (fanotify). In itself it does nothing. Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129165803.72138-2-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/mount.h | 4 +++ fs/notify/fsnotify.c | 47 +++++++++++++++++++++++++++----- fs/notify/fsnotify.h | 11 ++++++++ fs/notify/mark.c | 14 ++++++++-- include/linux/fsnotify.h | 20 ++++++++++++++ include/linux/fsnotify_backend.h | 42 ++++++++++++++++++++++++++++ 6 files changed, 128 insertions(+), 10 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index ffb613cdfeee9..82aa3bad7cf5a 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -21,6 +21,10 @@ struct mnt_namespace { struct rcu_head mnt_ns_rcu; }; u64 event; +#ifdef CONFIG_FSNOTIFY + __u32 n_fsnotify_mask; + struct fsnotify_mark_connector __rcu *n_fsnotify_marks; +#endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8ee495a58d0ad..c64b95cf50c71 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + fsnotify_clear_marks_by_mntns(mntns); +} + /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. @@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, file_name, cookie, iter_info); } -static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) +static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; @@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); - struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; - __u32 test_mask, marks_mask; + __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); @@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!inode2 || !inode2->i_fsnotify_marks)) + (!inode2 || !inode2->i_fsnotify_marks) && + (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; - marks_mask = READ_ONCE(sb->s_fsnotify_mask); + if (sb) + marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); - + if (mnt_data) + marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. @@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } + if (mnt_data) { + iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = + fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); + } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark @@ -702,11 +715,31 @@ void file_set_fsnotify_mode(struct file *file) } #endif +void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{ + struct fsnotify_mnt data = { + .ns = ns, + .mnt_id = real_mount(mnt)->mnt_id_unique, + }; + + if (WARN_ON_ONCE(!ns)) + return; + + /* + * This is an optimization as well as making sure fsnotify_init() has + * been called. + */ + if (!ns->n_fsnotify_marks) + return; + + fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); +} + static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 663759ed6fbc1..5950c7a67f414 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb( return conn->obj; } +static inline struct mnt_namespace *fsnotify_conn_mntns( + struct fsnotify_mark_connector *conn) +{ + return conn->obj; +} + static inline struct super_block *fsnotify_object_sb(void *obj, enum fsnotify_obj_type obj_type) { @@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } +static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) +{ + fsnotify_destroy_marks(&mntns->n_fsnotify_marks); +} + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4981439e62092..798340db69d76 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj, return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); + case FSNOTIFY_OBJ_TYPE_MNTNS: + return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } @@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) + return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } @@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ - if (!fsnotify_sb_info(sb)) { + if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; @@ -770,7 +777,8 @@ restart: /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1a9ef8f6784dd..589e274adc7d9 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -299,6 +299,11 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) __fsnotify_vfsmount_delete(mnt); } +static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + __fsnotify_mntns_delete(mntns); +} + /* * fsnotify_inoderemove - an inode is going away */ @@ -507,4 +512,19 @@ static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, NULL, NULL, NULL, 0); } +static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); +} + +static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_DETACH, ns, mnt); +} + +static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_MOVE, ns, mnt); +} + #endif /* _LINUX_FS_NOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0d24a21a8e60a..6cd8d1d28b8be 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -59,6 +59,10 @@ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FS_MNT_DETACH 0x02000000 /* Mount was detached */ +#define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. @@ -80,6 +84,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Mount namespace events */ +#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) + /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) @@ -108,6 +115,7 @@ /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ + FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ @@ -298,6 +306,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; @@ -318,6 +327,11 @@ static inline const struct path *file_range_path(const struct file_range *range) return range->path; } +struct fsnotify_mnt { + const struct mnt_namespace *ns; + u64 mnt_id; +}; + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -383,6 +397,24 @@ static inline struct super_block *fsnotify_data_sb(const void *data, } } +static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_MNT: + return data; + default: + return NULL; + } +} + +static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) +{ + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + + return mnt_data ? mnt_data->mnt_id : 0; +} + static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) @@ -420,6 +452,7 @@ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, + FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; @@ -429,6 +462,7 @@ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, + FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; @@ -613,8 +647,10 @@ extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); +extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); +extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { @@ -928,6 +964,9 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) static inline void fsnotify_sb_delete(struct super_block *sb) {} +static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{} + static inline void fsnotify_sb_free(struct super_block *sb) {} @@ -942,6 +981,9 @@ static inline u32 fsnotify_get_cookie(void) static inline void fsnotify_unmount_inodes(struct super_block *sb) {} +static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ -- GitLab From a9fe4f04da521a626f3a5c65c67eeac3adec11be Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 4 Feb 2025 17:33:26 +0800 Subject: [PATCH 028/934] mm: pgtable: Fix grammar error Fix "due high contention" to "due to high contention". Signed-off-by: I Hsin Cheng Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250204093326.206007-1-richard120310@gmail.com --- Documentation/mm/split_page_table_lock.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 8e1ceb0a6619a..cc3cd46abd1b6 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -4,7 +4,7 @@ Split page table lock Originally, mm->page_table_lock spinlock protected all page tables of the mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due high contention on the lock. To improve +multi-threaded applications due to high contention on the lock. To improve scalability, split page table lock was introduced. With split page table lock we have separate per-table lock to serialize -- GitLab From 512ca748e8f5509766159a7005f46617ba6b37ed Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:39 +0800 Subject: [PATCH 029/934] docs/zh_CN: Add tpm index Chinese translation Translate .../security/tpm/index.rst into Chinese Update the translation through commit 1d479e3cd652 ("Documentation: tpm: Add TPM security docs toctree entry") Reviewed-by: Alex Shi Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/900d91e8a2c7c35259005de5cff99e7bb4e7adf6.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/index.rst | 2 +- .../translations/zh_CN/security/tpm/index.rst | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/index.rst diff --git a/Documentation/translations/zh_CN/security/index.rst b/Documentation/translations/zh_CN/security/index.rst index d8aacd1930d99..949794b5eec1d 100644 --- a/Documentation/translations/zh_CN/security/index.rst +++ b/Documentation/translations/zh_CN/security/index.rst @@ -18,6 +18,7 @@ lsm sak siphash + tpm/index digsig landlock @@ -29,6 +30,5 @@ TODOLIST: * lsm-development * SCTP * self-protection -* tpm/index * secrets/index * ipe diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst new file mode 100644 index 0000000000000..a55d0a7bbc0fd --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/index.rst + +:翻译: + 赵硕 Shuo Zhao + +================ +可信平台模块文档 +================ + +.. toctree:: + + +TODOLIST: +* tpm_event_log +* tpm-security +* tpm_tis +* tpm_vtpm_proxy +* xen-tpmfront +* tpm_ftpm_tee -- GitLab From 4ad1ba0358b1b342a8233e6d70dd4f7bc5e1f152 Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:40 +0800 Subject: [PATCH 030/934] docs/zh_CN: Add tpm tpm_event_log Chinese translation Translate .../security/tpm/tpm_event_log.rst into Chinese. Update the translation through commit 2ef5a7f1482c ("tpm: Document UEFI event log quirks") Reviewed-by: Alex Shi Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/32c35c0504f262668996080b44dfe059a5266256.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/tpm/index.rst | 2 +- .../zh_CN/security/tpm/tpm_event_log.rst | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst index a55d0a7bbc0fd..f324bd4dd0931 100644 --- a/Documentation/translations/zh_CN/security/tpm/index.rst +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -12,9 +12,9 @@ .. toctree:: + tpm_event_log TODOLIST: -* tpm_event_log * tpm-security * tpm_tis * tpm_vtpm_proxy diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst b/Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst new file mode 100644 index 0000000000000..9c173291ac3ee --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_event_log.rst + +:翻译: + 赵硕 Shuo Zhao + +=========== +TPM事件日志 +=========== + +本文档简要介绍了什么是TPM日志,以及它是如何从预启动固件移交到操作系统的。 + +介绍 +==== + +预启动固件维护一个事件日志,每当它将某些内容哈希到任何一个PCR寄存器时,该 +日志会添加新条目。这些事件按类型分类,并包含哈希后的PCR寄存器值。通常,预 +启动固件会哈希那些即将移交执行权或与启动过程相关的组件。 + +其主要应用是远程认证,而它之所以有用的原因在[1]中第一部分很好地阐述了: + +认证用于向挑战者提供有关平台状态的信息。然而,PCR的内容难以解读;因此,当 +PCR内容附有测量日志时,认证通常会更有用。尽管测量日志本身并不可信,但它们 +包含比PCR内容更为丰富的信息集。PCR内容用于对测量日志进行验证。 + +UEFI事件日志 +============ + +UEFI提供的事件日志有一些比较奇怪的特性。 + +在调用ExitBootServices()之前,Linux EFI引导加载程序会将事件日志复制到由 +引导加载程序自定义的配置表中。不幸的是,通过ExitBootServices()生成的事件 +并不会出现在这个表里。 + +固件提供了一个所谓的最终事件配置表排序来解决这个问题。事件会在第一次调用 +EFI_TCG2_PROTOCOL.GetEventLog()后被镜像到这个表中。 + +这引出了另一个问题:无法保证它不会在 Linux EFI stub 开始运行之前被调用。 +因此,在 stub 运行时,它需要计算并将最终事件表的大小保存到自定义配置表中, +以便TPM驱动程序可以在稍后连接来自自定义配置表和最终事件表的两个事件日志时 +跳过这些事件。 + +参考文献 +======== + +- [1] https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/ +- [2] The final concatenation is done in drivers/char/tpm/eventlog/efi.c -- GitLab From ece0788d589117c34cfd22fc07467c155aa78a89 Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:41 +0800 Subject: [PATCH 031/934] docs/zh_CN: Add tpm tpm-security Chinese translation Translate .../security/tpm/tpm-security.rst into Chinese Update the translation through commit 3d2daf9d592e ("Documentation: add tpm-security.rst") Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/a7d98aaf8bc1393fb38095a0d2a3bc3e43c1c543.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/tpm/index.rst | 2 +- .../zh_CN/security/tpm/tpm-security.rst | 151 ++++++++++++++++++ 2 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/tpm-security.rst diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst index f324bd4dd0931..46cdf7c1a1c16 100644 --- a/Documentation/translations/zh_CN/security/tpm/index.rst +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -13,9 +13,9 @@ .. toctree:: tpm_event_log + tpm-security TODOLIST: -* tpm-security * tpm_tis * tpm_vtpm_proxy * xen-tpmfront diff --git a/Documentation/translations/zh_CN/security/tpm/tpm-security.rst b/Documentation/translations/zh_CN/security/tpm/tpm-security.rst new file mode 100644 index 0000000000000..26818d28c98f1 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm-security.rst @@ -0,0 +1,151 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm-security.rst + +:翻译: + 赵硕 Shuo Zhao + +TPM安全 +======= + +本文档的目的是描述我们如何使内核使用TPM在面对外部窥探和数据包篡改 +攻击(文献中称为被动和主动中间人攻击)时保持合理的稳健性。当前的 +安全文档适用于TPM2.0。 + +介绍 +---- + +TPM通常是一个通过某种低带宽总线连接到PC的独立芯片。虽然有一些 +例外,例如Intel PTT,它是运行在靠近CPU的软件环境中的软件TPM, +容易受到不同类型的攻击,但目前大多数强化的安全环境要求使用独立 +硬件TPM,这是本文讨论的使用场景。 + +总线上的窥探和篡改攻击 +---------------------- + +当前的技术状态允许使用 `TPM Genie`_ 硬件中间人,这是一种简单的外部设备,可以在 +任何系统或笔记本电脑上几秒钟内安装。最近成功演示了针对 `Windows Bitlocker TPM`_ +系统的攻击。最近同样的攻击针对 `基于TPM的Linux磁盘加密`_ 方案也遭到了同样的攻击。 +下一阶段的研究似乎是入侵总线上现有的设备以充当中间人,因此攻击者需要物理访问几 +秒钟的要求可能不再存在。然而,本文档的目标是尽可能在这种环境下保护TPM的机密性和 +完整性,并尝试确保即使我们不能防止攻击,至少可以检测到它。 + +不幸的是,大多数TPM功能,包括硬件重置功能,都能被能够访问总线的攻击 +者控制,因此下面我们将讨论一些可能出现的干扰情况。 + +测量(PCR)完整性 +----------------- + +由于攻击者可以向TPM发送自己的命令,他们可以发送任意的PCR扩展,从而破 +坏测量系统,这将是一种烦人的拒绝服务攻击。然而,针对密封到信任测量中 +的实体,有两类更严重的攻击。 + +1. 攻击者可以拦截来自系统的所有PCR扩展,并完全替换为自己的值,产生 + 一个未篡改状态的重现,这会使PCR测量证明状态是可信的,并释放密钥。 + +2. 攻击者可能会在某个时刻重置TPM,清除PCR,然后发送自己的测量,从而 + 有效地覆盖TPM已经完成的启动时间测量。 + +第一种攻击可以通过始终对PCR扩展和读取命令进行HMAC保护来防止,这意味着 +如果没有在响应中产生可检测的HMAC失败,则测量值无法被替换。然而第二种 +攻击只能通过依赖某种机制来检测,这种机制会在TPM重置后发生变化。 + +秘密保护 +-------- + +某些进出TPM的信息,如密钥密封、私钥导入和随机数生成容易被拦截,而仅仅 +使用HMAC保护无法防止这种情况。因此,对于这些类型的命令,我们必须使用 +请求和响应加密来防止秘密信息的泄露。 + +与TPM建立初始信任 +----------------- + +为了从一开始就提供安全性,必须建立一个初始的共享或非对称秘密,并且该 +秘钥必须对攻击者不可知。最明显的途径是使用背书和存储种子,这些可以用 +来派生非对称密钥。然而,使用这些密钥很困难,因为将它们传递给内核的唯 +一方法是通过命令行,这需要在启动系统中进行广泛的支持,而且无法保证任 +何一个层次不会有任何形式的授权。 + +Linux内核选择的机制是从空种子使用标准的存储种子参数派生出主椭圆曲线 +密钥。空种子有两个优势:首先该层次物理上无法具有授权,因此我们始终可 +以使用它;其次空种子在TPM重置时会发生变化,这意味着如果我们在当天开始 +时基于空种子建立信任,如果TPM重置且种子变化,则所有派生的密钥进行加盐 +处理的会话都将失败。 + +显然,在没有任何其他共享秘密的情况下使用空种子,我们必须创建并读取初始 +公钥,这当然可能会被总线中间人拦截并替换。然而,TPM有一个密钥认证机制 +(使用EK背书证书,创建认证身份密钥,并用该密钥认证空种子主密钥),但由 +于它过于复杂,无法在内核中运行,因此我们保留空主密钥名称的副本,通过 +sysfs导出,以便用户空间在启动时进行完整的认证。这里的明确保证是,如果空 +主密钥认证成功,那么从当天开始的所有TPM交易都是安全的;如果认证失败,则 +说明系统上有中间人(并且任何在启动期间使用的秘密可能已被泄露)。 + +信任堆叠 +-------- + +在当前的空主密钥场景中,TPM必须在交给下一个使用者之前完全清除。然而, +内核将派生出的空种子密钥的名称传递给用户空间,用户空间可以通过认证来 +验证该名称。因此,这种名称传递链也可以用于各个启动组件之间(通过未指 +定的机制)。例如grub可以使用空种子方案来实现安全性,并将名称交给内核。 +内核可以派生出密钥和名称,并确定如果它们与交接的版本不同,则表示发生 +了篡改。因此可以通过名称传递将任意启动组件(从UEFI到grub到内核)串联 +起来,只要每个后续组件知道如何收集该名称,并根据其派生的密钥进行验证。 + +会话属性 +-------- + +所有内核使用的TPM命令都允许会话。HMAC会话可用于检查请求和响应的完整性, +而解密和加密标志可用于保护参数和响应。HMAC和加密密钥通常是从共享授权秘 +钥推导出来的,但对于许多内核操作来说,这些密钥是已知的(通常为空)。因 +此内核使用空主密钥作为盐密钥来创建每个HMAC会话,这样就为会话密钥的派生 +提供了加密输入。因此内核仅需创建一次空主密钥(作为一个易失的TPM句柄), +并将其保存在tpm_chip中,用于每次在内核中使用TPM时。由于内核资源管理器缺 +乏去间隙化,当前每次操作都需要创建和销毁会话,但未来可能会将单个会话重用 +用于内核中的HMAC、加密和解密会话。 + +保护类型 +-------- + +对于每个内核操作,我们使用空主密钥加盐的HMAC来保护完整性。此外我们使用参数 +加密来保护密钥密封,并使用参数解密来保护密钥解封和随机数生成。 + +空主密钥认证在用户空间的实现 +============================ + +每个TPM都会附带几个X.509证书,通常用于主背书密钥。本文档假设存在椭圆曲线 +版本的证书,位于01C00002,但也同样适用于RSA证书(位于01C00001)。 + +认证的第一步是使用 `TCG EK Credential Profile`_ 模板进行主密钥的创建,该 +模板允许将生成的主密钥与证书中的主密钥进行比较(公钥必须匹配)。需要注意 +的是,生成EK主密钥需要EK层级密码,但EC主密钥的预生成版本应位于81010002, +并且可以无需密钥授权对其执行TPM2_ReadPublic()操作。接下来,证书本身必须 +经过验证,以确保其可以追溯到制造商根证书(该根证书应公开在制造商网站上)。 +完成此步骤后将在TPM内部生成一个认证密钥(AK),并使用TPM2_MakeCredential、 +AK的名称和EK公钥加密一个秘密。然后TPM执行TPM2_ActivateCredential,只有在 +TPM、EK和AK之间的绑定关系成立时,才能恢复秘密。现在,生成的AK可以用于对由 +内核导出的空主密钥进行认证。由于TPM2_MakeCredential/ActivateCredential操作 +相对复杂,下面将描述一种涉及外部生成私钥的简化过程。 + +这个过程是通常基于隐私CA认证过程的简化缩写。假设此时认证由TPM所有者进行, +所有者只能访问所有者层次。所有者创建一个外部公/私钥对(假设是椭圆曲线), +并使用内部包装过程将私钥进行封装以便导入,该私钥被其父级由EC派生的存储主密 +钥保护。TPM2_Import()操作使用一个以EK主密钥为盐值的参数解密HMAC会话(这也不 +需要EK密钥授权),意味着内部封装密钥是加密参数,因此除非TPM拥有认证的EK,否 +则无法执行导入操作。如果该命令成功执行并且HMAC在返回时通过验证,我们就知道 +我们有一个只为认证TPM加载的私钥副本。现在该密钥已加载到TPM中,并且存储主密 +钥已被清除(以释放空间用于生成空密钥)。 + +现在根据 `TCG TPM v2.0 Provisioning Guidance`_ 中的存储配置生成空EC主密钥; +该密钥的名称(即公钥区域的哈希值)被计算出来并与内核在/sys/class/tpm/tpm0/null_name +中提供的空种子名称进行比较。如果名称不匹配,TPM就被认为是受损的。如果名称匹配, +用户执行TPM2_Certify(),使用空主密钥作为对象句柄,使用加载的私钥作为签名句柄, +并提供随机的合格数据。返回的certifyInfo的签名将与加载的私钥的公钥部分进行验证, +并检查合格数据以防止重放。如果所有测试都通过,用户就可以确信TPM的完整性和隐私 +性在整个内核启动过程中得到了保护。 + +.. _TPM Genie: https://www.nccgroup.trust/globalassets/about-us/us/documents/tpm-genie.pdf +.. _Windows Bitlocker TPM: https://dolosgroup.io/blog/2021/7/9/from-stolen-laptop-to-inside-the-company-network +.. _基于TPM的Linux磁盘加密: https://www.secura.com/blog/tpm-sniffing-attacks-against-non-bitlocker-targets +.. _TCG EK Credential Profile: https://trustedcomputinggroup.org/resource/tcg-ek-credential-profile-for-tpm-family-2-0/ +.. _TCG TPM v2.0 Provisioning Guidance: https://trustedcomputinggroup.org/resource/tcg-tpm-v2-0-provisioning-guidance/ -- GitLab From 98526e6969336cd42086963b0d2037b3d5dc0279 Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:42 +0800 Subject: [PATCH 032/934] docs/zh_CN: Add tpm tpm_tis Chinese translation Translate .../security/tpm/tpm_tis.rst into Chinese. Update the translation through commit 8a55256a8462 ("Documentation: tpm_tis") Reviewed-by: Alex Shi Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/ac55092bc3f1b3ec51f3e2dd596616ade1a32076.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/tpm/index.rst | 2 +- .../zh_CN/security/tpm/tpm_tis.rst | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/tpm_tis.rst diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst index 46cdf7c1a1c16..4ee8881a41a48 100644 --- a/Documentation/translations/zh_CN/security/tpm/index.rst +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -14,9 +14,9 @@ tpm_event_log tpm-security + tpm_tis TODOLIST: -* tpm_tis * tpm_vtpm_proxy * xen-tpmfront * tpm_ftpm_tee diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_tis.rst b/Documentation/translations/zh_CN/security/tpm/tpm_tis.rst new file mode 100644 index 0000000000000..0fb009f93e107 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_tis.rst @@ -0,0 +1,43 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_tis.rst + +:翻译: + 赵硕 Shuo Zhao + +TPM FIFO接口驱动 +================ + +TCG PTP规范定义了两种接口类型:FIFO和CRB。前者基于顺序的读写操作, +后者基于包含完整命令或响应的缓冲区。 + +FIFO(先进先出)接口被tpm_tis_core依赖的驱动程序使用。最初,Linux只 +有一个名为tpm_tis的驱动,覆盖了内存映射(即 MMIO)接口,但后来它被 +扩展以支持TCG标准所支持的其他物理接口。 + +由于历史原因,最初的MMIO驱动被称为tpm_tis,而FIFO驱动的框架被命名为 +tpm_tis_core。在tpm_tis中的“tis”后缀来自TPM接口规范,这是针对TPM1.x +芯片的硬件接口规范。 + +通信基于一个由TPM芯片通过硬件总线或内存映射共享的20KiB 缓冲区,具体 +取决于物理接线。该缓冲区进一步分为五个相等大小的4KiB缓冲区,为CPU和 +TPM之间的通信提供等效的寄存器集。这些通信端点在TCG术语中称为localities。 + +当内核想要向TPM芯片发送命令时,它首先通过在TPM_ACCESS寄存器中设置 +requestUse位来保留locality0。当访问被授予时,该位由芯片清除。一旦完成 +通信,内核会写入TPM_ACCESS.activeLocality位。这告诉芯片该本地性已被 +释放。 + +待处理的本地性由芯片按优先级降序逐个服务,一次一个: + +- Locality0优先级最低。 +- Locality5优先级最高。 + +关于localities的更多信息和含义,请参阅TCG PC客户端平台TPM 配置文件规范的第3.2节。 + +参考文献 +======== + +TCG PC客户端平台TPM配置文件(PTP)规范 +https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/ -- GitLab From f7824b6917072306621aea8d3b51478de87c4e67 Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:43 +0800 Subject: [PATCH 033/934] docs/zh_CN: Add tpm tpm_vtpm_proxy Chinese translation Translate .../security/tpm/tpm_vtpm_proxy.rst into Chinese. Update the translation through commit 799a545bb938 ("tpm: move documentation under Documentation/security") Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/f9798eaec76b27cc02fa47970bf623879377d422.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/tpm/index.rst | 2 +- .../zh_CN/security/tpm/tpm_vtpm_proxy.rst | 51 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst index 4ee8881a41a48..dd80816f8af88 100644 --- a/Documentation/translations/zh_CN/security/tpm/index.rst +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -15,8 +15,8 @@ tpm_event_log tpm-security tpm_tis + tpm_vtpm_proxy TODOLIST: -* tpm_vtpm_proxy * xen-tpmfront * tpm_ftpm_tee diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst b/Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst new file mode 100644 index 0000000000000..bc92cfb684c36 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_vtpm_proxy.rst + +:翻译: + 赵硕 Shuo Zhao + +========================== +Linux容器的虚拟TPM代理驱动 +========================== + +| 作者: +| Stefan Berger + +本文档描述了用于Linux容器的虚拟可信平台模块(vTPM)代理设备驱动。 + +介绍 +==== + +这项工作的目标是为每个Linux容器提供TPM功能。这使得程序能够像与物理系统 +上的TPM交互一样,与容器中的TPM进行交互。每个容器都会获得一个唯一的、模 +拟的软件TPM。 + +设计 +==== + +为了使每个容器都能使用模拟的软件TPM,容器管理栈需要创建一对设备,其中 +包括一个客户端TPM字符设备 ``/dev/tpmX`` (X=0,1,2...)和一个‘服务器端’ +文件描述符。当文件描述符传被递给TPM模拟器时,前者通过创建具有适当主次 +设备号的字符设备被移入容器,然后,容器内的软件可以使用字符设备发送TPM +命令,模拟器将通过文件描述符接收这些命令,并用它来发送响应。 + +为了支持这一点,虚拟TPM代理驱动程序提供了一个设备 ``/dev/vtpmx`` ,该设备 +用于通过ioctl创建设备对。ioctl将其作为配置设备的输入标志,例如这些标志指示 +TPM模拟器是否支持TPM1.2或TPM2功能。ioctl的结果是返回‘服务器端’的文件描述符 +以及创建的字符设备的主次设备号。此外,还会返回TPM字符设备的编号。例如,如果 +创建了 ``/dev/tpm10`` ,则返回编号( ``dev_num`` )10。 + +一旦设备被创建,驱动程序将立即尝试与TPM进行通信。来自驱动程序的所有命令 +都可以从ioctl返回的文件描述符中读取。这些命令应该立即得到响应。 + +UAPI +==== + +该API在以下内核代码中: + +include/uapi/linux/vtpm_proxy.h +drivers/char/tpm/tpm_vtpm_proxy.c + +函数:vtpmx_ioc_new_dev -- GitLab From 22ab45a82136bd8d5abd2a66951f58043351f461 Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:44 +0800 Subject: [PATCH 034/934] docs/zh_CN: Add tpm xen-tpmfront Chinese translation Translate .../security/tpm/xen-tpmfront.rst into Chinese. Update the translation through commit 9e255e2b9afe ("Documentation: drop optional BOMs") Reviewed-by: Alex Shi Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/570a7a7c6f55996c02dd2e474a4e8cbfa8f9ccc3.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/tpm/index.rst | 2 +- .../zh_CN/security/tpm/xen-tpmfront.rst | 114 ++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst index dd80816f8af88..bf95200ca5868 100644 --- a/Documentation/translations/zh_CN/security/tpm/index.rst +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -16,7 +16,7 @@ tpm-security tpm_tis tpm_vtpm_proxy + xen-tpmfront TODOLIST: -* xen-tpmfront * tpm_ftpm_tee diff --git a/Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst b/Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst new file mode 100644 index 0000000000000..fa085d98a99b7 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/xen-tpmfront.rst + +:翻译: + 赵硕 Shuo Zhao + +================ +Xen的虚拟TPM接口 +================ + +作者:Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA) + +本文档描述了用于Xen的虚拟可信平台模块(vTPM)子系统。假定读者熟悉 +Xen和Linux的构建和安装,并对TPM和vTPM概念有基本的理解。 + +介绍 +---- + +这项工作的目标是为虚拟客户操作系统(在Xen中称为DomU)提供TPM功能。这使得 +程序能够像与物理系统上的TPM交互一样,与虚拟系统中的TPM进行交互。每个客户 +操作系统都会获得一个唯一的、模拟的软件TPM。然而,vTPM的所有秘密(如密钥、 +NVRAM 等)由vTPM管理域进行管理,该域将这些秘密封存到物理TPM中。如果创建这 +些域(管理域、vTPM域和客户域)的过程是可信的,vTPM子系统就能将根植于硬件 +TPM的信任链扩展到Xen中的虚拟机。vTPM的每个主要组件都作为一个独立的域实现, +从而通过虚拟机监控程序(hypervisor)提供安全隔离。 + +这个mini-os vTPM 子系统是建立在IBM和Intel公司之前的vTPM工作基础上的。 + + +设计概述 +-------- + +vTPM的架构描述如下:: + + +------------------+ + | Linux DomU | ... + | | ^ | + | v | | + | xen-tpmfront | + +------------------+ + | ^ + v | + +------------------+ + | mini-os/tpmback | + | | ^ | + | v | | + | vtpm-stubdom | ... + | | ^ | + | v | | + | mini-os/tpmfront | + +------------------+ + | ^ + v | + +------------------+ + | mini-os/tpmback | + | | ^ | + | v | | + | vtpmmgr-stubdom | + | | ^ | + | v | | + | mini-os/tpm_tis | + +------------------+ + | ^ + v | + +------------------+ + | Hardware TPM | + +------------------+ + +* Linux DomU: + 希望使用vTPM的基于Linux的客户机。可能有多个这样的实例。 + +* xen-tpmfront.ko: + Linux内核虚拟TPM前端驱动程序。该驱动程序为基于Linux的DomU提供 + vTPM访问。 + +* mini-os/tpmback: + Mini-os TPM后端驱动程序。Linux前端驱动程序通过该后端驱动程序连 + 接,以便在Linux DomU和其vTPM之间进行通信。该驱动程序还被 + vtpmmgr-stubdom用于与vtpm-stubdom通信。 + +* vtpm-stubdom: + 一个实现vTPM的mini-os存根域。每个正在运行的vtpm-stubdom实例与系统 + 上的逻辑vTPM之间有一一对应的关系。vTPM平台配置寄存器(PCRs)通常都 + 初始化为零。 + +* mini-os/tpmfront: + Mini-os TPM前端驱动程序。vTPM mini-os域vtpm-stubdom使用该驱动程序 + 与vtpmmgr-stubdom通信。此驱动程序还用于与vTPM域通信的mini-os域,例 + 如 pv-grub。 + +* vtpmmgr-stubdom: + 一个实现vTPM管理器的mini-os域。系统中只有一个vTPM管理器,并且在整个 + 机器生命周期内应一直运行。此域调节对系统中物理TPM的访问,并确保每个 + vTPM的持久状态。 + +* mini-os/tpm_tis: + Mini-osTPM1.2版本TPM 接口规范(TIS)驱动程序。该驱动程序由vtpmmgr-stubdom + 用于直接与硬件TPM通信。通信通过将硬件内存页映射到vtpmmgr-stubdom来实现。 + +* 硬件TPM: + 固定在主板上的物理 TPM。 + +与Xen的集成 +----------- + +vTPM驱动程序的支持已在Xen4.3中通过libxl工具堆栈添加。有关设置vTPM和vTPM +管理器存根域的详细信息,请参见Xen文档(docs/misc/vtpm.txt)。一旦存根域 +运行,与磁盘或网络设备相同,vTPM设备将在域的配置文件中进行设置 + +为了使用诸如IMA(完整性测量架构)等需要在initrd之前加载TPM的功能,必须将 +xen-tpmfront驱动程序编译到内核中。如果不使用这些功能,驱动程序可以作为 +模块编译,并像往常一样加载。 -- GitLab From 03069bf12823a950b9fe7b4903fc68eb0555c73d Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Thu, 23 Jan 2025 13:17:45 +0800 Subject: [PATCH 035/934] docs/zh_CN: Add tpm tpm_ftpm_tee Chinese translation Translate .../security/tpm/tpm_ftpm_tee.rst into Chinese. Update the translation through commit e8bd417aab0c ("tpm/tpm_ftpm_tee: Document fTPM TEE driver") Reviewed-by: Alex Shi Signed-off-by: Shuo Zhao Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/5de6312d2b9d23646eb306e74bae3f80ff28941d.1737603330.git.zhaoshuo@cqsoftware.com.cn --- .../translations/zh_CN/security/tpm/index.rst | 4 +-- .../zh_CN/security/tpm/tpm_ftpm_tee.rst | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst index bf95200ca5868..7076465906470 100644 --- a/Documentation/translations/zh_CN/security/tpm/index.rst +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -17,6 +17,4 @@ tpm_tis tpm_vtpm_proxy xen-tpmfront - -TODOLIST: -* tpm_ftpm_tee + tpm_ftpm_tee diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst b/Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst new file mode 100644 index 0000000000000..5901eee32563e --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst @@ -0,0 +1,31 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_ftpm_tee.rst + +:翻译: + 赵硕 Shuo Zhao + +=========== +固件TPM驱动 +=========== + +本文档描述了固件可信平台模块(fTPM)设备驱动。 + +介绍 +==== + +该驱动程序是用于ARM的TrustZone环境中实现的固件的适配器。该驱动 +程序允许程序以与硬件TPM相同的方式与TPM进行交互。 + +设计 +==== + +该驱动程序充当一个薄层,传递命令到固件实现的TPM并接收其响应。驱动 +程序本身并不包含太多逻辑,更像是固件与内核/用户空间之间的一个管道。 + +固件本身基于以下论文: +https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/ftpm1.pdf + +当驱动程序被加载时,它会向用户空间暴露 ``/dev/tpmX`` 字符设备,允许 +用户空间通过该设备与固件TPM进行通信。 -- GitLab From f5c7cc77acf5abebc5aec4f4236954c23f29e09b Mon Sep 17 00:00:00 2001 From: Shuo Zhao Date: Tue, 14 Jan 2025 10:28:42 +0800 Subject: [PATCH 036/934] docs/zh_CN: Add security credentials Chinese translation Translate .../security/credentials.rst into Chinese. Update the translation through commit cf92ec602ac5 ("Documentation: remove current_security() reference") Reviewed-by: Yanteng Si Reviewed-by: Alex Shi Signed-off-by: Shuo Zhao Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250114022843.22489-1-zhaoshuo@cqsoftware.com.cn --- .../zh_CN/security/credentials.rst | 479 ++++++++++++++++++ .../translations/zh_CN/security/index.rst | 2 +- 2 files changed, 480 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/credentials.rst diff --git a/Documentation/translations/zh_CN/security/credentials.rst b/Documentation/translations/zh_CN/security/credentials.rst new file mode 100644 index 0000000000000..91c353dfb6221 --- /dev/null +++ b/Documentation/translations/zh_CN/security/credentials.rst @@ -0,0 +1,479 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/security/credentials.rst + +:翻译: + 赵硕 Shuo Zhao + +============= +Linux中的凭据 +============= + +作者: David Howells + +.. contents:: :local: + +概述 +==== + +当一个对象对另一个对象进行操作时,Linux执行的安全检查包含几个部分: + + 1. 对象 + + 对象是可以直接由用户空间程序操作的系统中的实体。Linux具有多种可操作 + 的对象,包括: + + - 任务 + - 文件/索引节点 + - 套接字 + - 消息队列 + - 共享内存段 + - 信号量 + - 密钥 + + 所有这些对象的描述的一部分是一组凭据。集合中的内容取决于对象的类型。 + + 2. 对象所有权 + + 大多数对象的凭据中会有一个子集用来表示该对象的所有权。 + 这用于资源核算和限制(如磁盘配额和任务资源限制)。 + + 例如,在标准的UNIX文件系统中,这将由标记在索引节点上的UID定义。 + + 3. 对象上下文 + + 此外在这些对象的凭据中,将有一个子集表示对象的“对象上下文”。 + 这可能与(2)中相同,也可能不同 —— 例如,在标准的UNIX文件中, + 这是由标记在索引节点上的UID和GID定义的。 + + 对象上下文是进行安全计算的一部分,当对象被操作时会用到。 + + 4. 主体 + + 主体是正在对其他对象执行操作的对象。 + + 系统中的大多数对象是不活动的:他们不会对系统中的其他对象起作用。 + 进程/任务是明显的例外:它们可以访问和操纵其他对象。 + + 任务之外的其他对象在某些情况下也可以是主体。例如,打开的文件可以使用 + 名为 ``fcntl(F_SETOWN)`` 的任务给它的UID和EUID向一个任务发送SIGIO + 信号。在这种情况下,文件结构也会有一个主体上下文。 + + 5. 主体上下文 + + 主体对其凭据有一个额外的解释。其凭据的一个子集形成了“主体上下文”。主体 + 上下文在主体执行操作时作为安全计算的一部分使用。 + + 例如,Linux任务在操作文件时会有FSUID、FSGID和附加组列表 —— 这些凭据 + 与通常构成任务的对象上下文的真实UID和GID是相互独立的。 + + 6. 操作 + + Linux提供许多操作,主体可以对对象执行这些操作。可用的操作集取决于主体 + 和对象的性质。 + + + 操作包括读取、写入、创建和删除文件,以及派生(forking)或发送 + 信号(signalling)和跟踪(tracing)任务等。 + + 7. 规则,访问控制列表和安全计算 + + 当主体对对象进行操作时,会进行安全计算。这涉及到使用主体上下文、对象 + 上下文和操作,并搜索一个或多个规则集,以确定在给定这些上下文的情况下, + 主体是否被授予或拒绝以所需方式对对象进行操作的权限。 + + 主要有两个规则来源: + + a. 自主访问控制(DAC): + + 有时,对象的描述中会包含一组规则。这就是所谓的“访问控制列表”或‘ACL’。 + 一个Linux文件可以提供多个ACL。 + + 例如,传统的UNIX文件包括一个权限掩码,它是一个简化的ACL,具有三个固定的 + 主体类别(“用户”、“组”和“其他”),每一个都可以被授予一定的特权(如“读取”、 + “写入”和“执行” —— 无论这些映射对于对象意味着什么)。然而,UNIX文件权限不 + 允许任意指定主体,因此用途有限。 + + Linux文件还可以支持POSIX ACL。这是一个规则列表,为任意主体授予各种权限。 + + b. 强制访问控制(MAC): + + 整个系统可能有一个或多个规则集,适用于所有主体和对象,不考虑它们的来源。 + SELinux和Smack就是这种情况的例子。 + + 在SELinux和Smack的情况下,每个对象在其凭据中都被赋予一个标签。当请求执 + 行操作时,它们使用主体标签、对象标签和操作,寻找一个规则,该规则表示此操 + 作是授予还是拒绝的。 + + +凭据类型 +======== + +Linux内核支持以下类型的凭据: + + 1. 传统的UNIX凭据。 + + - 真实用户ID + - 真实组ID + + UID和GID由大多数(如果不是全部)Linux对象携带,即使有时它们需要被虚构出 + 来(例如FAT或CIFS文件,这些文件来源于Windows)。这些(通常)定义了该对象 + 的对象上下文,但任务在某些情况下略有不同。 + + - 有效用户ID,保存用户ID和FS用户ID + - 有效组ID,保存组ID和FS组ID + - 补充组 + + 这些是仅由任务使用的额外凭据。通常,一个EUID/EGID/GROUPS 被用作主体上下文, + 而真实UID/GID 被用作对象上下文。对于任务,这并不总是正确的。 + + 2. 能力 + + - 允许的能力集合 + - 可继承的能力集合 + - 有效的能力集合 + - 能力边界集合 + + 这些仅由任务携带,表示授予任务的超出普通任务权限的能力。这些可以通过传统 + UNIX凭据的更改进行隐式操作,但也可以通过 ``capset()`` 系统调用直接操作。 + + 允许的能力是指进程可以通过 ``capset()`` 将其添加到其有效或允许集合中的 + 那些能力。这个可继承的集合也可能受到这样的限制。 + + 有效能力是任务本身实际可以使用的能力。 + + 可继承能力是那些可以通过 ``execve()`` 传递的能力。 + + 边界集限制了通过 ``execve()`` 继承的能力,特别是在以UID 0执行二进制文件时。 + + 3. 安全管理标记(securebits) + + 它们用于控制上述凭据在特定操作如execve()中的操作和继承方式。它们并不直接 + 用作对象或主体凭据使用。 + + 4. 密钥和密钥环 + + 这些仅由任务携带。它们用于携带和缓存不适合放入其他标准UNIX凭据中的安全令牌。 + 它们用诸如使网络文件系统密钥在进程执行的文件访问时可用,而无需让普通程序了解 + 涉及的安全细节。 + + 密钥环是一种特殊类型的密钥。它们携带一组其他密钥,并可以搜索来查找所需的密钥。 + 每个进程可以订阅多个密钥环: + + 每线程密钥 + 每进程密钥环 + 每会话密钥环 + + 当进程访问一个密钥时,若尚不存在,则通常会将其缓存在一个密钥环中,以便将来的 + 访问时找到该密钥。 + + 有关密钥的更多信息,请参见 ``Documentation/translations/zh_CN/security/keys/*`` 。 + + 5. LSM + + Linux安全模块允许在任务执行操作时施加额外的控制。目前,Linux支持几种LSM选项。 + + 一些工作通过标记系统中的对象,并应用一组规则(策略)说明某个标签的任务可以对 + 另一标签的对象执行哪些操作。 + + 6. AF_KEY + + 这是一种基于套接字网络协议栈中的凭据管理[RFC 2367]。本文档中没有讨论它,因为不 + 直接与任务和文件凭据进行交互,而是保留了系统级的凭据。 + + +当打开一个文件时,打开任务的主体上下文的一部分会记录在创建的文件结构中。 +这使得使用该文件结构的操作可以使用这些凭据,而不是发出操作的任务的主体上下文。 +一个例子是在网络文件系统上打开的文件,打开文件的凭据应该被呈现给服务器,而不管 +实际进行读取或写入操作的是谁。 + + +文件标记 +======== + +存储在磁盘上或通过网络获取的文件可能具有注释,构成该文件的对象安全上下文。 +根据文件系统的类型,这些注释可能包括以下一项或多项: + + * UNIX UID, GID, mode; + * Windows user ID; + * Access control list; + * LSM security label; + * UNIX exec privilege escalation bits (SUID/SGID); + * File capabilities exec privilege escalation bits. + +将这些与任务的主体安全上下文进行比较,并根据比较结果允许或禁止执行某些操作。 +在execve()的情况下,特权提升位起作用,并且可能允许由可执行文件的注释决定的 +进程获得额外的特权。 + + +任务凭据 +======== + +在Linux中,一个任务的所有凭据都保存在一个引用计数结构体‘struct cred’中, +通过(uid, gid)或(groups, keys, LSM security)进行访问。每个任务在其 +task_struct中通过一个名为‘cred’的指针指向其凭据。 + +一旦一组凭据已经准备好并提交,除非以下几种情况,否则不能更改: + + 1. 其引用计数可以更改; + + 2. 它所指向的 group_info 结构体的引用计数可以更改; + + 3. 它所指向的安全数据的引用计数可以更改; + + 4. 它所指向的任何密钥环的引用计数可以更改; + + 5. 它所指向的任何密钥环可以被撤销、过期或其安全属性可以更改; + + 6. 它所指向的任何密钥环的内容可以更改(密钥环的整个目的就是作为一组共享凭据, + 可由具有适当访问权限的任何人修改)。 + +要更改cred结构体中的任何内容,必须遵循复制和替换的原则。首先进行复制,然后修 +改副本,最后使用RCU(读-复制-更新)将任务指针更改为指向新的副本。有一些封装可 +用于帮助执行这个过程(见下文)。 + +一个任务只能修改自己的凭据;不再允许一个任务修改另一个任务的凭据。 +这意味着 ``capset()`` 系统调用不再允许使用除当前进程之外的任何PID。 +此外, ``keyctl_instantiate()`` 和 ``keyctl_negate()`` 函数也不再 +允许在请求进程中附加到特定于进程的密钥环,因为实例化进程可能需要创建它们。 + + +不可变凭据 +---------- + +一旦一组凭据已经被公开(例如通过调用 ``commit_creds()`` ),必须将其视为 +不可变的,除了两个例外情况: + + 1. 引用计数可以被修改。 + + 2. 虽然无法更改一组凭据的密钥环订阅,但订阅的密钥环的内容可以被更改。 + +为了在编译时捕获意外的凭据修改,struct task_struct具有_const_指针指向其凭据集, +struct file也是如此。此外,某些函数如 ``get_cred()`` 和 ``put_cred()`` 在 +const指针上操作,因此不需要进行类型转换,但需要临时放弃const限定,以便能够修改 +引用计数。 + + +访问任务凭据 +------------ + +任务只能修改自己的凭据,允许当前进程可以读取或替换自己的凭据,无需任何形式锁定的 +情况下 —— 这极大简化了事情。它可以调用:: + + const struct cred *current_cred() + +获取指向其凭据结构的指针,并且之后不必释放它。 + +有一些方便的封装用于检索任务凭据的特定方面(在每种情况下都只返回值):: + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + struct user_struct *current_user(void) Current's user account + +还有一些方便的封装,用于检索任务凭据的特定关联对:: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +在从当前任务的凭据中检索后,通过其参数返回这些值对。 + + +此外,还有一个函数用于获取当前进程的当前凭据集的引用:: + + const struct cred *get_current_cred(void); + +以及用于获取对一个实际上不存在于struct cred中的凭据的引用的函数:: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +分别获得对当前进程的 user accounting structure 和补充组列表的引用。 + +一旦获得引用,就必须使用 ``put_cred()``, ``free_uid()`` 或 +``put_group_info()`` 来适当释放它。 + + +访问其他任务的凭据 +------------------ + +虽然一个任务可以在不需要锁定的情况下访问自己的凭据,但想要访问另一个任务 +的凭据的任务并非如此。它必须使用RCU读锁和 ``rcu_dereference()``。 + +``rcu_dereference()`` 是由:: + + const struct cred *__task_cred(struct task_struct *task); + +这应该在RCU读锁中使用,如下例所示:: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ... + rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +如果需要长时间持有另一个任务的凭据,并且可能在此过程中休眠,则调用方 +应该使用以下函数来获取对这些凭据的引用:: + + const struct cred *get_task_cred(struct task_struct *task); + +这个函数内部完成了所有的RCU操作。当使用完这些凭据时,调用方必须调用put_cred() +函数释放它们。 + +.. note:: + ``__task_cred()`` 的结果不应直接传递给 ``get_cred()`` , + 因为这可能与 ``commit_cred()`` 发生竞争条件。 + +还有一些方便的函数可以访问另一个任务凭据的特定部分,将RCU操作对调用方隐藏起来:: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +如果调用方在此时已经持有RCU读锁,则应使用:: + + __task_cred(task)->uid + __task_cred(task)->euid + +类似地,如果需要访问任务凭据的多个方面,应使用RCU读锁,调用 ``__task_cred()`` +函数,将结果存储在临时指针中,然后从临时指针中调用凭据的各个方面,最后释放锁。 +这样可以防止多次调用昂贵的RCU操作。 + +如果需要访问另一个任务凭据的其他单个方面,可以使用:: + + task_cred_xxx(task, member) + +这里的‘member’是cred结构体的非指针成员。例如:: + + uid_t task_cred_xxx(task, suid); + +将从任务中检索‘struct cred::suid’,并执行适当的RCU操作。对于指针成员, +不能使用这种形式,因为它们指向的内容可能在释放RCU读锁的瞬间消失。 + + +修改凭据 +-------- + +如先前提到的,一个任务只能修改自己的凭据,不能修改其他任务的凭据。这意味 +着它不需要使用任何锁来修改自己的凭据。 + +要修改当前进程的凭据,函数应首先调用:: + + struct cred *prepare_creds(void); + +这将锁定current->cred_replace_mutex,然后分配并构建当前进程凭据的副本。 +如果成功,函数返回时仍然保持互斥锁。如果不成功(内存不足),则返回NULL。 + +互斥锁防止 ``ptrace()`` 在进行凭据构建和更改的安全检查时更改进程的ptrace +状态,因为ptrace状态可能会改变结果,特别是在 ``execve()`` 的情况下。 + +新的凭据集应适当地进行修改,并进行任何安全检查和挂钩。在此时,当前和建议的 +凭据集都可用,因为current_cred()将返回当前的凭据集。 + +在替换组列表时,必须在将其添加到凭据之前对新列表进行排序,因为使用二分查找 +测试成员资格。实际上,这意味着在set_groups()或set_current_groups()之 +前应调用groups_sort()。groups_sort()不能在共享的 ``struct group_list`` +上调用,因为即使数组已经排序,它也可能作为排序过程的一部分对元素进行排列。 + +当凭据集准备好时,应通过调用以下函数将其提交给当前进程:: + + int commit_creds(struct cred *new); + +这将修改凭据和进程的各个方面,给LSM提供机会做同样的修改,然后使用 +``rcu_assign_pointer()`` 将新的凭据实际提交给 ``current->cred`` , +释放 ``current->cred_replace_mutex`` 以允许 ``ptrace()`` 进行操 +作,并通知调度程序和其他组件有关更改的情况。 + +该函数保证返回0,以便可以在诸如 ``sys_setresuid()`` 函数的末尾进行尾调用。 + +请注意,该函数会消耗调用者对新凭据的引用。调用者在此之后不应调用 +``put_cred()`` 释放新凭据。 + +此外,一旦新的凭据上调用了该函数,就不能进一步更改这些凭据。 + + +如果在调用 ``prepare_creds()`` 之后安全检查失败或发生其他错误, +则应调用以下函数:: + + void abort_creds(struct cred *new); + +这将释放 ``prepare_creds()`` 获取的 ``current->cred_replace_mutex`` 的锁, +并释放新的凭据。 + +一个典型的凭据修改函数看起来像这样:: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +管理凭据 +-------- + +有一些函数用来辅助凭据管理: + + - ``void put_cred(const struct cred *cred);`` + + 这将释放对给定凭据集的引用。如果引用计数为零,凭据集将由 + RCU系统安排进行销毁。 + + - ``const struct cred *get_cred(const struct cred *cred);`` + + 这将获取对活动凭据集的引用。返回指向凭据集的指针。 + + - ``struct cred *get_new_cred(struct cred *cred);`` + + 这将获取对当前正在构建且可变的凭据集的引用。返回指向凭据集的指针。 + +打开文件凭据 +============ + +当打开新文件时,会获取对打开任务凭据的引用,并将其附加到文件结构体的 +``f_cred`` 字段中,替代原来的 ``f_uid`` 和 ``f_gid`` 。原来访问 +``file->f_uid`` 和 ``file->f_gid`` 的代码现在应访问 ``file->f_cred->fsuid`` +和 ``file->f_cred->fsgid`` 。 + +安全访问 ``f_cred`` 的情况下可以不使用RCU或加锁,因为指向凭据的指针 +以及指向的凭据结构的内容在文件结构的整个生命周期中保持不变,除非是 +上述列出的例外情况(参阅任务凭据部分)。 + +为了避免“混淆代理”权限提升攻击,在打开的文件后续操作时,访问控制检查 +应该使用这些凭据,而不是使用“当前”的凭据,因为该文件可能已经被传递给 +一个更具特权的进程。 + +覆盖VFS对凭据的使用 +=================== + +在某些情况下,需要覆盖VFS使用的凭据,可以通过使用不同的凭据集调用 +如 ``vfs_mkdir()`` 来实现。以下是一些进行此操作的位置: + + * ``sys_faccessat()``. + * ``do_coredump()``. + * nfs4recover.c. diff --git a/Documentation/translations/zh_CN/security/index.rst b/Documentation/translations/zh_CN/security/index.rst index 949794b5eec1d..0aacecabf0c0e 100644 --- a/Documentation/translations/zh_CN/security/index.rst +++ b/Documentation/translations/zh_CN/security/index.rst @@ -15,6 +15,7 @@ .. toctree:: :maxdepth: 1 + credentials lsm sak siphash @@ -23,7 +24,6 @@ landlock TODOLIST: -* credentials * snp-tdx-threat-model * IMA-templates * keys/index -- GitLab From b48e0f696b710bf75610bff189c4aad1aeaf6d75 Mon Sep 17 00:00:00 2001 From: Chen Pei Date: Wed, 8 Jan 2025 10:03:42 +0800 Subject: [PATCH 037/934] Documentation: riscv: Remove KPROBES_ON_FTRACE Since commit 7caa9765465f60 ("ftrace: riscv: move from REGS to ARGS"), kprobe on ftrace is not supported by riscv. And commit 3308172276db5d ("trace: riscv: Remove deprecated kprobe on ftrace support") removed the relevant code, but left out the documentation, so fix that. Signed-off-by: Chen Pei Reviewed-by: Charlie Jenkins Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250108020342.4172-1-cp0613@linux.alibaba.com --- Documentation/features/debug/kprobes-on-ftrace/arch-support.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 02febc883588e..d937b7a035757 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -20,7 +20,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | ok | + | riscv: | TODO | | s390: | ok | | sh: | TODO | | sparc: | TODO | -- GitLab From e8bcda12176c47f2ce6c5104955845d028a640e8 Mon Sep 17 00:00:00 2001 From: Diego Viola Date: Tue, 4 Feb 2025 01:04:43 -0300 Subject: [PATCH 038/934] docs: admin-guide: rename GTK+ to GTK Upstream calls it "GTK" now, see [1] [1] https://mail.gnome.org/archives/gtk-devel-list/2019-February/msg00000.html Signed-off-by: Diego Viola Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250204040444.6299-1-diego.viola@gmail.com --- Documentation/admin-guide/README.rst | 2 +- Documentation/translations/zh_CN/admin-guide/README.rst | 2 +- Documentation/translations/zh_TW/admin-guide/README.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index eb94526689091..d30867620cea7 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -165,7 +165,7 @@ Configuring the kernel "make xconfig" Qt based configuration tool. - "make gconfig" GTK+ based configuration tool. + "make gconfig" GTK based configuration tool. "make oldconfig" Default all questions based on the contents of your existing ./.config file and asking about diff --git a/Documentation/translations/zh_CN/admin-guide/README.rst b/Documentation/translations/zh_CN/admin-guide/README.rst index e679cbc3c89d0..1bdafdc4c8e23 100644 --- a/Documentation/translations/zh_CN/admin-guide/README.rst +++ b/Documentation/translations/zh_CN/admin-guide/README.rst @@ -146,7 +146,7 @@ Linux内核6.x版本 "make xconfig" 基于Qt的配置工具。 - "make gconfig" 基于GTK+的配置工具。 + "make gconfig" 基于GTK的配置工具。 "make oldconfig" 基于现有的 ./.config 文件选择所有选项,并询问 新配置选项。 diff --git a/Documentation/translations/zh_TW/admin-guide/README.rst b/Documentation/translations/zh_TW/admin-guide/README.rst index a6e34c200ea32..0b038074d9d1f 100644 --- a/Documentation/translations/zh_TW/admin-guide/README.rst +++ b/Documentation/translations/zh_TW/admin-guide/README.rst @@ -149,7 +149,7 @@ Linux內核6.x版本 "make xconfig" 基於Qt的配置工具。 - "make gconfig" 基於GTK+的配置工具。 + "make gconfig" 基於GTK的配置工具。 "make oldconfig" 基於現有的 ./.config 文件選擇所有選項,並詢問 新配置選項。 -- GitLab From 1b133129ad6b28186214259af3bd5fc651a85509 Mon Sep 17 00:00:00 2001 From: E Shattow Date: Mon, 9 Dec 2024 20:19:56 -0800 Subject: [PATCH 039/934] riscv: dts: starfive: Fix a typo in StarFive JH7110 pin function definitions Fix a typo in StarFive JH7110 pin function definitions for GPOUT_SYS_SDIO1_DATA4 Fixes: e22f09e598d12 ("riscv: dts: starfive: Add StarFive JH7110 pin function definitions") Signed-off-by: E Shattow Acked-by: Hal Feng CC: stable@vger.kernel.org Signed-off-by: Conor Dooley --- arch/riscv/boot/dts/starfive/jh7110-pinfunc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h b/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h index 256de17f52611..ae49c908e7fb3 100644 --- a/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h +++ b/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h @@ -89,7 +89,7 @@ #define GPOUT_SYS_SDIO1_DATA1 59 #define GPOUT_SYS_SDIO1_DATA2 60 #define GPOUT_SYS_SDIO1_DATA3 61 -#define GPOUT_SYS_SDIO1_DATA4 63 +#define GPOUT_SYS_SDIO1_DATA4 62 #define GPOUT_SYS_SDIO1_DATA5 63 #define GPOUT_SYS_SDIO1_DATA6 64 #define GPOUT_SYS_SDIO1_DATA7 65 -- GitLab From 824d4f2dce50da4b748c1e280cb50e4b82146ce7 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:51 +0900 Subject: [PATCH 040/934] sched_ext: Add an event, SCX_EV_ENQ_SKIP_EXITING Add a core event, SCX_EV_ENQ_SKIP_EXITING, which represents how many times a task is enqueued to a local DSQ when exiting if SCX_OPS_ENQ_EXITING is not set. __scx_add_event() is used since the caller holds an rq lock, so the preemption has already been disabled. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cbb3d059072fb..c72ba405a1d38 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1461,6 +1461,12 @@ struct scx_event_stats { * continued to run because there were no other tasks on the CPU. */ u64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + u64 SCX_EV_ENQ_SKIP_EXITING; }; /* @@ -2068,8 +2074,10 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, /* see %SCX_OPS_ENQ_EXITING */ if (!static_branch_unlikely(&scx_ops_enq_exiting) && - unlikely(p->flags & PF_EXITING)) + unlikely(p->flags & PF_EXITING)) { + __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); goto local; + } if (!SCX_HAS_OP(enqueue)) goto global; @@ -4985,6 +4993,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), @@ -7121,6 +7130,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); } /* -- GitLab From 4f7a38c7c917436bb40f10c251a1a973e2f1ada8 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:52 +0900 Subject: [PATCH 041/934] sched_ext: Add an event, SCX_EV_BYPASS_ACTIVATE Add a core event, SCX_EV_BYPASS_ACTIVATE, which represents how many times the bypass mode has been triggered. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c72ba405a1d38..377c2e99cf4fa 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1467,6 +1467,11 @@ struct scx_event_stats { * is dispatched to a local DSQ when exiting. */ u64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * The number of times the bypassing mode has been activated. + */ + u64 SCX_EV_BYPASS_ACTIVATE; }; /* @@ -4399,6 +4404,7 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; + scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); @@ -4994,6 +5000,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), @@ -7131,6 +7138,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); } /* -- GitLab From 5c605cd33cad957f3eff747e088d07db23808080 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:53 +0900 Subject: [PATCH 042/934] sched_ext: Add an event, SCX_EV_BYPASS_DISPATCH Add a core event, SCX_EV_BYPASS_DISPATCH, which represents how many tasks have been dispatched in the bypass mode. __scx_add_event() is used since the caller holds an rq lock or p->pi_lock, so the preemption has already been disabled. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 377c2e99cf4fa..0ed2db44ae335 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1468,6 +1468,11 @@ struct scx_event_stats { */ u64 SCX_EV_ENQ_SKIP_EXITING; + /* + * The number of tasks dispatched in the bypassing mode. + */ + u64 SCX_EV_BYPASS_DISPATCH; + /* * The number of times the bypassing mode has been activated. */ @@ -2071,8 +2076,10 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (scx_rq_bypassing(rq)) { + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); goto global; + } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; @@ -3253,6 +3260,8 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + bool rq_bypass; + /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory @@ -3266,7 +3275,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (SCX_HAS_OP(select_cpu) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3292,6 +3302,9 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } + + if (rq_bypass) + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -5000,6 +5013,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) @@ -7138,6 +7152,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); } -- GitLab From d46457c31c432b6b717dcaebab634d5126e2969a Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:54 +0900 Subject: [PATCH 043/934] sched_ext: Add an event, SCX_EV_BYPASS_DURATION Add a core event, SCX_EV_BYPASS_DURATION, which represents the total duration of bypass modes in nanoseconds. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0ed2db44ae335..8a9a30895381a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1468,6 +1468,11 @@ struct scx_event_stats { */ u64 SCX_EV_ENQ_SKIP_EXITING; + /* + * The total duration of bypass modes in nanoseconds. + */ + u64 SCX_EV_BYPASS_DURATION; + /* * The number of tasks dispatched in the bypassing mode. */ @@ -4408,6 +4413,8 @@ static void scx_clear_softlockup(void) static void scx_ops_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); + static unsigned long bypass_timestamp; + int cpu; unsigned long flags; @@ -4417,12 +4424,15 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; + bypass_timestamp = ktime_get_ns(); scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; + scx_add_event(SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } atomic_inc(&scx_ops_breather_depth); @@ -5013,6 +5023,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); @@ -7152,6 +7163,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); } -- GitLab From 9865f31d852a40e39b8efb8feec69182e1002489 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:55 +0900 Subject: [PATCH 044/934] sched_ext: Add scx_bpf_events() and scx_read_event() for BPF schedulers scx_bpf_events() is added to the header files so the BPF scheduler can use it. Also, scx_read_event() is added to read an event type in a compatible way. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f254a39b86a58..7055400030241 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -78,6 +78,10 @@ struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; +void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; +#define scx_read_event(e, name) \ + (bpf_core_field_exists((e)->name) ? (e)->name : 0) + /* * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from * within bpf_for_each() loops. -- GitLab From 6df93804b718ec9741f056dae777f12332d54002 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:56 +0900 Subject: [PATCH 045/934] sched_ext: Print core event count in scx_central scheduler Modify the scx_central scheduler to print the core event counter every second. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.bpf.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 50bc1737c167a..376c14d5dd0d9 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -256,6 +256,7 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) u64 now = scx_bpf_now(); u64 nr_to_kick = nr_queued; s32 i, curr_cpu; + struct scx_event_stats events; curr_cpu = bpf_get_smp_processor_id(); if (timer_pinned && (curr_cpu != central_cpu)) { @@ -291,6 +292,26 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); __sync_fetch_and_add(&nr_timers, 1); + + /* print event counters every second */ + if (nr_timers % 1000 == 0) { + scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %llu\n", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + } return 0; } -- GitLab From 2494e555fbaadf1b02eb89f3bee0cec95516f6c2 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 4 Feb 2025 14:20:57 +0900 Subject: [PATCH 046/934] sched_ext: Print core event count in scx_qmap scheduler Modify the scx_qmap scheduler to print the core event counter every second. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- tools/sched_ext/scx_qmap.bpf.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 3a20bb0c014a1..5edb79742e379 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -763,6 +763,8 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { + struct scx_event_stats events; + bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); @@ -772,6 +774,23 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) if (print_shared_dsq) dump_shared_dsq(); + scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %llu\n", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; } -- GitLab From 1ac98d6484c4a53e9b3ed1f59d03a2dd2d287b70 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 11 Jan 2025 18:23:26 +0100 Subject: [PATCH 047/934] soc: qcom: pmic_glink: Drop redundant pg assignment before taking lock Commit e9f826b0459f ("soc: qcom: pmic_glink: simplify locking with guard()") was on top of a fix [1] which was moving the 'pg = __pmic_glink' assignment into the critical section. Unfortunately the actual fix was not applied and instead rebased version of the next patch got in. The resulting code is in general correct, but now there is a duplicated assignment 'pg = __pmic_glink'. [1] https://lore.kernel.org/all/20240822164815.230167-1-krzysztof.kozlowski@linaro.org/ Fixes: e9f826b0459f ("soc: qcom: pmic_glink: simplify locking with guard()") Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250111172326.101779-1-krzysztof.kozlowski@linaro.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/pmic_glink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c index 052c292eeda61..cde19cdfd3c7f 100644 --- a/drivers/soc/qcom/pmic_glink.c +++ b/drivers/soc/qcom/pmic_glink.c @@ -233,7 +233,7 @@ static void pmic_glink_pdr_callback(int state, char *svc_path, void *priv) static int pmic_glink_rpmsg_probe(struct rpmsg_device *rpdev) { - struct pmic_glink *pg = __pmic_glink; + struct pmic_glink *pg; guard(mutex)(&__pmic_glink_lock); pg = __pmic_glink; -- GitLab From 7f048b202333b967782a98aa21bb3354dc379bbf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 Jan 2025 09:52:53 +0300 Subject: [PATCH 048/934] firmware: qcom: scm: Fix error code in probe() Set the error code if devm_qcom_tzmem_pool_new() fails. Don't return success. Fixes: 1e76b546e6fc ("firmware: qcom: scm: Cleanup global '__scm' on probe failures") Signed-off-by: Dan Carpenter Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/a0845467-4f83-4070-ab1e-ff7e6764609f@stanley.mountain Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_scm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index f0569bb9411f7..fc4d67e4c4a67 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -2301,8 +2301,8 @@ static int qcom_scm_probe(struct platform_device *pdev) __scm->mempool = devm_qcom_tzmem_pool_new(__scm->dev, &pool_config); if (IS_ERR(__scm->mempool)) { - dev_err_probe(__scm->dev, PTR_ERR(__scm->mempool), - "Failed to create the SCM memory pool\n"); + ret = dev_err_probe(__scm->dev, PTR_ERR(__scm->mempool), + "Failed to create the SCM memory pool\n"); goto err; } -- GitLab From bea1d19f03644b0eb7ac0b7d96f79a3af13f196b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Dec 2024 08:47:43 -0800 Subject: [PATCH 049/934] doc: Add broken-timing possibility to stallwarn.rst Currently, stallwarn.rst does not mention the fact that timer bugs can result in false-positive RCU CPU stall warnings. This commit therefore adds this to the list. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/stallwarn.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 30080ff6f4062..d1ccd6039a8c3 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -96,6 +96,13 @@ warnings: the ``rcu_.*timer wakeup didn't happen for`` console-log message, which will include additional debugging information. +- A timer issue causes time to appear to jump forward, so that RCU + believes that the RCU CPU stall-warning timeout has been exceeded + when in fact much less time has passed. This could be due to + timer hardware bugs, timer driver bugs, or even corruption of + the "jiffies" global variable. These sorts of timer hardware + and driver bugs are not uncommon when testing new hardware. + - A low-level kernel issue that either fails to invoke one of the variants of rcu_eqs_enter(true), rcu_eqs_exit(true), ct_idle_enter(), ct_idle_exit(), ct_irq_enter(), or ct_irq_exit() on the one -- GitLab From df0cee43114bd212e3fc9add2a18639605049b95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 13:37:17 -0800 Subject: [PATCH 050/934] docs: Improve discussion of this_cpu_ptr(), add raw_cpu_ptr() Most of the this_cpu_*() operations may be used in preemptible code, but not this_cpu_ptr(), and for good reasons. Therefore, better explain the reasons and call out raw_cpu_ptr() as an alternative in certain very special cases. Signed-off-by: Paul E. McKenney Cc: Jonathan Corbet Cc: Peter Zijlstra Cc: Mark Rutland Cc: Boqun Feng Cc: Signed-off-by: Boqun Feng --- Documentation/core-api/this_cpu_ops.rst | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Documentation/core-api/this_cpu_ops.rst b/Documentation/core-api/this_cpu_ops.rst index 91acbcf30e9bd..533ac5dd57507 100644 --- a/Documentation/core-api/this_cpu_ops.rst +++ b/Documentation/core-api/this_cpu_ops.rst @@ -138,12 +138,22 @@ get_cpu/put_cpu sequence requires. No processor number is available. Instead, the offset of the local per cpu area is simply added to the per cpu offset. -Note that this operation is usually used in a code segment when -preemption has been disabled. The pointer is then used to -access local per cpu data in a critical section. When preemption -is re-enabled this pointer is usually no longer useful since it may -no longer point to per cpu data of the current processor. - +Note that this operation can only be used in code segments where +smp_processor_id() may be used, for example, where preemption has been +disabled. The pointer is then used to access local per cpu data in a +critical section. When preemption is re-enabled this pointer is usually +no longer useful since it may no longer point to per cpu data of the +current processor. + +The special cases where it makes sense to obtain a per-CPU pointer in +preemptible code are addressed by raw_cpu_ptr(), but such use cases need +to handle cases where two different CPUs are accessing the same per cpu +variable, which might well be that of a third CPU. These use cases are +typically performance optimizations. For example, SRCU implements a pair +of counters as a pair of per-CPU variables, and rcu_read_lock_nmisafe() +uses raw_cpu_ptr() to get a pointer to some CPU's counter, and uses +atomic_inc_long() to handle migration between the raw_cpu_ptr() and +the atomic_inc_long(). Per cpu variables and offsets ----------------------------- -- GitLab From 21ef2498622197429c3105254587034d93a745b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 16:53:02 -0800 Subject: [PATCH 051/934] rcu: Document self-propagating callbacks This commit documents the fact that a given RCU callback function can repost itself. Reported-by: Jens Axboe Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed141..2cd193ed854c9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3107,7 +3107,7 @@ module_param(enable_rcu_lazy, bool, 0444); * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -3138,6 +3138,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, -- GitLab From 366ba3f7f9ce4924d544b9d2c6514a7b19270953 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 19:35:40 -0800 Subject: [PATCH 052/934] srcu: Point call_srcu() to call_rcu() for detailed memory ordering This commit causes the call_srcu() kernel-doc header to reference that of call_rcu() for detailed memory-ordering guarantees. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/srcutree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0d..f87a9fb6d6bb8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1398,8 +1398,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) -- GitLab From 053ca72554df6923d703a1c072f1930e1870b040 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 19:42:42 -0800 Subject: [PATCH 053/934] rcu: Add CONFIG_RCU_LAZY delays to call_rcu() kernel-doc header This commit adds a description of the energy-efficiency delays that call_rcu() can impose, along with a pointer to call_rcu_hurry() for latency-sensitive kernel code. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2cd193ed854c9..780e4b30febbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3172,6 +3172,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { -- GitLab From 81a208c56ef6a07cfde338603e891cc647b73d21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 19:50:37 -0800 Subject: [PATCH 054/934] rcu: Clarify RCU_LAZY and RCU_LAZY_DEFAULT_OFF help text This commit wordsmiths the RCU_LAZY and RCU_LAZY_DEFAULT_OFF Kconfig options' help text. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185db..2bb22dac3b5a9 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -323,21 +323,27 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. - Requires rcu_nocbs=all to be set. + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. - Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. config RCU_LAZY_DEFAULT_OFF bool "Turn RCU lazy invocation off by default" depends on RCU_LAZY default n help - Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default - off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch - it back on. + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=0 kernel-boot parameter to override + the this option at boot time, thus re-enabling these delays. config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" -- GitLab From 73298c7cf1b901809d2a7f3636e4635839ff8033 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 08:52:15 -0800 Subject: [PATCH 055/934] rcu: Remove references to old grace-period-wait primitives The rcu_barrier_sched(), synchronize_sched(), and synchronize_rcu_bh() RCU API members have been gone for many years. This commit therefore removes non-historical instances of them. Reported-by: Joe Perches Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/rcubarrier.rst | 5 +---- include/linux/rcupdate.h | 17 +++++++---------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst index 6da7f66da2a80..12a7b059654f7 100644 --- a/Documentation/RCU/rcubarrier.rst +++ b/Documentation/RCU/rcubarrier.rst @@ -329,10 +329,7 @@ Answer: was first added back in 2005. This is because on_each_cpu() disables preemption, which acted as an RCU read-side critical section, thus preventing CPU 0's grace period from completing - until on_each_cpu() had dealt with all of the CPUs. However, - with the advent of preemptible RCU, rcu_barrier() no longer - waited on nonpreemptible regions of code in preemptible kernels, - that being the job of the new rcu_barrier_sched() function. + until on_each_cpu() had dealt with all of the CPUs. However, with the RCU flavor consolidation around v4.20, this possibility was once again ruled out, because the consolidated diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd8..3bb554723074d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -806,11 +806,9 @@ do { \ * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * - * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also - * wait for regions of code with preemption disabled, including regions of - * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which - * define synchronize_sched(), only code enclosed within rcu_read_lock() - * and rcu_read_unlock() are guaranteed to be waited for. + * Both synchronize_rcu() and call_rcu() also wait for regions of code + * with preemption disabled, including regions of code with interrupts or + * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen @@ -865,11 +863,10 @@ static __always_inline void rcu_read_lock(void) * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. - * In recent kernels that have consolidated synchronize_sched() and - * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity - * also extends to the scheduler's runqueue and priority-inheritance - * spinlocks, courtesy of the quiescent-state deferral that is carried - * out when rcu_read_unlock() is invoked with interrupts disabled. + * This deadlock immunity also extends to the scheduler's runqueue + * and priority-inheritance spinlocks, courtesy of the quiescent-state + * deferral that is carried out when rcu_read_unlock() is invoked with + * interrupts disabled. * * See rcu_read_lock() for more information. */ -- GitLab From a3e8162105e8266b94bb25d3d7e48645da4b0c26 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 08:39:53 -0700 Subject: [PATCH 056/934] rcu: Split rcu_report_exp_cpu_mult() mask parameter and use for tracing This commit renames the rcu_report_exp_cpu_mult() function from "mask" to "mask_in" and introduced a "mask" local variable to better support upcoming event-tracing additions. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Boqun Feng --- kernel/rcu/tree_exp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e3..8d4895c854c59 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp; raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); -- GitLab From 764f6a81103e83dc552237f77cc769dd550d8e01 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Sun, 10 Nov 2024 14:47:47 +0000 Subject: [PATCH 057/934] rcu: Remove READ_ONCE() for rdp->gpwrap access in __note_gp_changes() There is one access to the per-CPU rdp->gpwrap field in the __note_gp_changes() function that does not use READ_ONCE(), but all other accesses do use READ_ONCE(). When using the 8*TREE03 and CONFIG_NR_CPUS=8 configuration, KCSAN found no data races at that point. This is because all calls to __note_gp_changes() hold rnp->lock, which excludes writes to the rdp->gpwrap fields for all CPUs associated with that same leaf rcu_node structure. This commit therefore removes READ_ONCE() from rdp->gpwrap accesses within the __note_gp_changes() function. Signed-off-by: Zilin Guan Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed141..e4c0ce600b2be 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1254,7 +1254,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1268,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1283,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); -- GitLab From 83179cd67846fae0e6aea5848c07faaa5d89a1de Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Fri, 24 Jan 2025 09:38:26 +0000 Subject: [PATCH 058/934] uprobes: Remove the spinlock within handle_singlestep() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a flag to track TIF_SIGPENDING is suppress temporarily during the uprobe single-step. Upon uprobe singlestep is handled and the flag is confirmed, it could resume the TIF_SIGPENDING directly without acquiring the siglock in most case, then reducing contention and improving overall performance. I've use the script developed by Andrii in [1] to run benchmark. The CPU used was Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores@2.4GHz running the kernel on next tree + the optimization for get_xol_insn_slot() [2]. before-opt ---------- uprobe-nop ( 1 cpus): 0.907 ± 0.003M/s ( 0.907M/s/cpu) uprobe-nop ( 2 cpus): 1.676 ± 0.008M/s ( 0.838M/s/cpu) uprobe-nop ( 4 cpus): 3.210 ± 0.003M/s ( 0.802M/s/cpu) uprobe-nop ( 8 cpus): 4.457 ± 0.003M/s ( 0.557M/s/cpu) uprobe-nop (16 cpus): 3.724 ± 0.011M/s ( 0.233M/s/cpu) uprobe-nop (32 cpus): 2.761 ± 0.003M/s ( 0.086M/s/cpu) uprobe-nop (64 cpus): 1.293 ± 0.015M/s ( 0.020M/s/cpu) uprobe-push ( 1 cpus): 0.883 ± 0.001M/s ( 0.883M/s/cpu) uprobe-push ( 2 cpus): 1.642 ± 0.005M/s ( 0.821M/s/cpu) uprobe-push ( 4 cpus): 3.086 ± 0.002M/s ( 0.771M/s/cpu) uprobe-push ( 8 cpus): 3.390 ± 0.003M/s ( 0.424M/s/cpu) uprobe-push (16 cpus): 2.652 ± 0.005M/s ( 0.166M/s/cpu) uprobe-push (32 cpus): 2.713 ± 0.005M/s ( 0.085M/s/cpu) uprobe-push (64 cpus): 1.313 ± 0.009M/s ( 0.021M/s/cpu) uprobe-ret ( 1 cpus): 1.774 ± 0.000M/s ( 1.774M/s/cpu) uprobe-ret ( 2 cpus): 3.350 ± 0.001M/s ( 1.675M/s/cpu) uprobe-ret ( 4 cpus): 6.604 ± 0.000M/s ( 1.651M/s/cpu) uprobe-ret ( 8 cpus): 6.706 ± 0.005M/s ( 0.838M/s/cpu) uprobe-ret (16 cpus): 5.231 ± 0.001M/s ( 0.327M/s/cpu) uprobe-ret (32 cpus): 5.743 ± 0.003M/s ( 0.179M/s/cpu) uprobe-ret (64 cpus): 4.726 ± 0.016M/s ( 0.074M/s/cpu) after-opt --------- uprobe-nop ( 1 cpus): 0.985 ± 0.002M/s ( 0.985M/s/cpu) uprobe-nop ( 2 cpus): 1.773 ± 0.005M/s ( 0.887M/s/cpu) uprobe-nop ( 4 cpus): 3.304 ± 0.001M/s ( 0.826M/s/cpu) uprobe-nop ( 8 cpus): 5.328 ± 0.002M/s ( 0.666M/s/cpu) uprobe-nop (16 cpus): 6.475 ± 0.002M/s ( 0.405M/s/cpu) uprobe-nop (32 cpus): 4.831 ± 0.082M/s ( 0.151M/s/cpu) uprobe-nop (64 cpus): 2.564 ± 0.053M/s ( 0.040M/s/cpu) uprobe-push ( 1 cpus): 0.964 ± 0.001M/s ( 0.964M/s/cpu) uprobe-push ( 2 cpus): 1.766 ± 0.002M/s ( 0.883M/s/cpu) uprobe-push ( 4 cpus): 3.290 ± 0.009M/s ( 0.823M/s/cpu) uprobe-push ( 8 cpus): 4.670 ± 0.002M/s ( 0.584M/s/cpu) uprobe-push (16 cpus): 5.197 ± 0.004M/s ( 0.325M/s/cpu) uprobe-push (32 cpus): 5.068 ± 0.161M/s ( 0.158M/s/cpu) uprobe-push (64 cpus): 2.605 ± 0.026M/s ( 0.041M/s/cpu) uprobe-ret ( 1 cpus): 1.833 ± 0.001M/s ( 1.833M/s/cpu) uprobe-ret ( 2 cpus): 3.384 ± 0.003M/s ( 1.692M/s/cpu) uprobe-ret ( 4 cpus): 6.677 ± 0.004M/s ( 1.669M/s/cpu) uprobe-ret ( 8 cpus): 6.854 ± 0.005M/s ( 0.857M/s/cpu) uprobe-ret (16 cpus): 6.508 ± 0.006M/s ( 0.407M/s/cpu) uprobe-ret (32 cpus): 5.793 ± 0.009M/s ( 0.181M/s/cpu) uprobe-ret (64 cpus): 4.743 ± 0.016M/s ( 0.074M/s/cpu) Above benchmark results demonstrates a obivious improvement in the scalability of trig-uprobe-nop and trig-uprobe-push, the peak throughput of which are from 4.5M/s to 6.4M/s and 3.3M/s to 5.1M/s individually. [1] https://lore.kernel.org/all/20240731214256.3588718-1-andrii@kernel.org [2] https://lore.kernel.org/all/20240727094405.1362496-1-liaochang1@huawei.com Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Signed-off-by: Liao Chang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250124093826.2123675-3-liaochang1@huawei.com --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b1df7d792fa16..a40efdda9052b 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -143,6 +143,7 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + bool signal_denied; struct arch_uprobe *auprobe; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 33bd6083f7c40..870f69780900e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2302,6 +2302,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { @@ -2735,9 +2736,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* see uprobe_deny_signal() */ - spin_unlock_irq(¤t->sighand->siglock); + if (utask->signal_denied) { + set_thread_flag(TIF_SIGPENDING); + utask->signal_denied = false; + } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); -- GitLab From 314dfe10576912e1d786b13c5d4eee8c51b63caa Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 21 Jan 2025 07:23:00 -0800 Subject: [PATCH 059/934] perf/x86/intel: Apply static call for drain_pebs The x86_pmu_drain_pebs static call was introduced in commit 7c9903c9bf71 ("x86/perf, static_call: Optimize x86_pmu methods"), but it's not really used to replace the old method. Apply the static call for drain_pebs. Fixes: 7c9903c9bf71 ("x86/perf, static_call: Optimize x86_pmu methods") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250121152303.3128733-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/perf_event.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7601196d1d18e..2acea83526c63 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3076,7 +3076,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu_handle_guest_pebs(regs, &data); - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba74e11983280..322963b02a912 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -957,7 +957,7 @@ static inline void intel_pmu_drain_pebs_buffer(void) { struct perf_sample_data data; - x86_pmu.drain_pebs(NULL, &data); + static_call(x86_pmu_drain_pebs)(NULL, &data); } /* diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 31c2771545a6c..084e9196b4582 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1107,6 +1107,7 @@ extern struct x86_pmu x86_pmu __read_mostly; DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); +DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { -- GitLab From f9bdf1f953392c9edd69a7f884f78c0390127029 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 21 Jan 2025 07:23:01 -0800 Subject: [PATCH 060/934] perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read The WARN_ON(this_cpu_read(cpu_hw_events.enabled)) in the intel_pmu_save_and_restart_reload() is triggered, when sampling read topdown events. In a NMI handler, the cpu_hw_events.enabled is set and used to indicate the status of core PMU. The generic pmu->pmu_disable_count, updated in the perf_pmu_disable/enable pair, is not touched. However, the perf_pmu_disable/enable pair is invoked when sampling read in a NMI handler. The cpuc->enabled is mistakenly set by the perf_pmu_enable(). Avoid disabling PMU if the core PMU is already disabled. Merge the logic together. Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics") Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250121152303.3128733-2-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 41 ++++++++++++++++++++---------------- arch/x86/events/intel/ds.c | 11 +--------- arch/x86/events/perf_event.h | 2 +- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2acea83526c63..1ccc961f81821 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2785,28 +2785,33 @@ static u64 icl_update_topdown_event(struct perf_event *event) DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); -static void intel_pmu_read_topdown_event(struct perf_event *event) +static void intel_pmu_read_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + bool pmu_enabled = cpuc->enabled; - /* Only need to call update_topdown_event() once for group read. */ - if ((cpuc->txn_flags & PERF_PMU_TXN_READ) && - !is_slots_event(event)) - return; + /* Only need to call update_topdown_event() once for group read. */ + if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ)) + return; - perf_pmu_disable(event->pmu); - static_call(intel_pmu_update_topdown_event)(event); - perf_pmu_enable(event->pmu); -} + cpuc->enabled = 0; + if (pmu_enabled) + intel_pmu_disable_all(); -static void intel_pmu_read_event(struct perf_event *event) -{ - if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) - intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event)) - intel_pmu_read_topdown_event(event); - else - x86_perf_event_update(event); + if (is_topdown_event(event)) + static_call(intel_pmu_update_topdown_event)(event); + else + intel_pmu_drain_pebs_buffer(); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + intel_pmu_enable_all(0); + + return; + } + + x86_perf_event_update(event); } static void intel_pmu_enable_fixed(struct perf_event *event) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 322963b02a912..eb14b46423e58 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -953,7 +953,7 @@ unlock: return 1; } -static inline void intel_pmu_drain_pebs_buffer(void) +void intel_pmu_drain_pebs_buffer(void) { struct perf_sample_data data; @@ -2094,15 +2094,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) return NULL; } -void intel_pmu_auto_reload_read(struct perf_event *event) -{ - WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)); - - perf_pmu_disable(event->pmu); - intel_pmu_drain_pebs_buffer(); - perf_pmu_enable(event->pmu); -} - /* * Special variant of intel_pmu_save_and_restart() for auto-reload. */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 084e9196b4582..536a112f6353e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1644,7 +1644,7 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); -void intel_pmu_auto_reload_read(struct perf_event *event); +void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); -- GitLab From 8ce939a0fa194939cc1f92dbd8bc1a7806e7d40a Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 21 Jan 2025 07:23:02 -0800 Subject: [PATCH 061/934] perf: Avoid the read if the count is already updated The event may have been updated in the PMU-specific implementation, e.g., Intel PEBS counters snapshotting. The common code should not read and overwrite the value. The PERF_SAMPLE_READ in the data->sample_type can be used to detect whether the PMU-specific value is available. If yes, avoid the pmu->read() in the common code. Add a new flag, skip_read, to track the case. Factor out a perf_pmu_read() to clean up the code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250121152303.3128733-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 8 +++++++- kernel/events/core.c | 33 ++++++++++++++++----------------- kernel/events/ring_buffer.c | 1 + 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8333f132f4a96..2d07bc1193f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1062,7 +1062,13 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - u64 aux_flags; + union { + u64 flags; /* perf_output*() */ + u64 aux_flags; /* perf_aux_output*() */ + struct { + u64 skip_read : 1; + }; + }; union { void *addr; unsigned long head; diff --git a/kernel/events/core.c b/kernel/events/core.c index bcb09e011e9e1..0f8c559907830 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1191,6 +1191,12 @@ static void perf_assert_pmu_disabled(struct pmu *pmu) WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } +static inline void perf_pmu_read(struct perf_event *event) +{ + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); +} + static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); @@ -3473,8 +3479,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -4618,15 +4623,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -7444,9 +7442,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7459,9 +7456,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7520,6 +7516,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 180509132d4b6..59a52b1a1f789 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { -- GitLab From e02e9b0374c378aab016ae8ace60d9d98ab8caa6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 21 Jan 2025 07:23:03 -0800 Subject: [PATCH 062/934] perf/x86/intel: Support PEBS counters snapshotting The counters snapshotting is a new adaptive PEBS extension, which can capture programmable counters, fixed-function counters, and performance metrics in a PEBS record. The feature is available in the PEBS format V6. The target counters can be configured in the new fields of MSR_PEBS_CFG. Then the PEBS HW will generate the bit mask of counters (Counters Group Header) followed by the content of all the requested counters into a PEBS record. The current Linux perf sample read feature can read all events in the group when any event in the group is overflowed. But the rdpmc in the NMI/overflow handler has a small gap from overflow. Also, there is some overhead for each rdpmc read. The counters snapshotting feature can be used as an accurate and low-overhead replacement. Extend intel_update_topdown_event() to accept the value from PEBS records. Add a new PEBS_CNTR flag to indicate a sample read group that utilizes the counters snapshotting feature. When the group is scheduled, the PEBS configure can be updated accordingly. To prevent the case that a PEBS record value might be in the past relative to what is already in the event, perf always stops the PMU and drains the PEBS buffer before updating the corresponding event->count. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250121152303.3128733-4-kan.liang@linux.intel.com --- arch/x86/events/core.c | 13 ++ arch/x86/events/intel/core.c | 75 ++++++++--- arch/x86/events/intel/ds.c | 191 +++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 13 ++ arch/x86/events/perf_event_flags.h | 2 +- arch/x86/include/asm/perf_event.h | 15 +++ 6 files changed, 284 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8f218ac0d445c..7b6430e5a77b6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); +DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -1298,6 +1300,15 @@ static void x86_pmu_enable(struct pmu *pmu) if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; + + /* + * The late setup (after counters are scheduled) + * is required for some cases, e.g., PEBS counters + * snapshotting. Because an accurate counter index + * is needed. + */ + static_call_cond(x86_pmu_late_setup)(); + /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -2035,6 +2046,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); + + static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); } static void _x86_pmu_read(struct perf_event *event) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1ccc961f81821..8ce915abcd642 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2720,7 +2720,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots, * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2728,13 +2728,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) bool reset = true; int idx; - /* read Fixed counter 3 */ - rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); - if (!slots) - return 0; + if (!val) { + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; - /* read PERF_METRICS */ - rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + } else { + slots = val[0]; + metrics = val[1]; + /* + * Don't reset the PERF_METRICS and Fixed counter 3 + * for each PEBS record read. Utilize the RDPMC metrics + * clear mode. + */ + reset = false; + } for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) @@ -2777,17 +2788,19 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) return slots; } -static u64 icl_update_topdown_event(struct perf_event *event) +static u64 icl_update_topdown_event(struct perf_event *event, u64 *val) { return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + - x86_pmu.num_topdown_events - 1); + x86_pmu.num_topdown_events - 1, + val); } -DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); static void intel_pmu_read_event(struct perf_event *event) { - if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) { + if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) || + is_pebs_counter_event_group(event)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); bool pmu_enabled = cpuc->enabled; @@ -2799,8 +2812,12 @@ static void intel_pmu_read_event(struct perf_event *event) if (pmu_enabled) intel_pmu_disable_all(); - if (is_topdown_event(event)) - static_call(intel_pmu_update_topdown_event)(event); + /* + * If the PEBS counters snapshotting is enabled, + * the topdown event is available in PEBS records. + */ + if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); @@ -2943,7 +2960,7 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { if (unlikely(is_topdown_count(event))) - return static_call(intel_pmu_update_topdown_event)(event); + return static_call(intel_pmu_update_topdown_event)(event, NULL); return x86_perf_event_update(event); } @@ -3109,7 +3126,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - static_call(intel_pmu_update_topdown_event)(NULL); + static_call(intel_pmu_update_topdown_event)(NULL, NULL); } /* @@ -3127,6 +3144,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (!test_bit(bit, cpuc->active_mask)) continue; + /* + * There may be unprocessed PEBS records in the PEBS buffer, + * which still stores the previous values. + * Process those records first before handling the latest value. + * For example, + * A is a regular counter + * B is a PEBS event which reads A + * C is a PEBS event + * + * The following can happen: + * B-assist A=1 + * C A=2 + * B-assist A=3 + * A-overflow-PMI A=4 + * C-assist-PMI (PEBS buffer) A=5 + * + * The PEBS buffer has to be drained before handling the A-PMI + */ + if (is_pebs_counter_event_group(event)) + x86_pmu.drain_pebs(regs, &data); + if (!intel_pmu_save_and_restart(event)) continue; @@ -4074,6 +4112,13 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } + if ((event->attr.sample_type & PERF_SAMPLE_READ) && + (x86_pmu.intel_cap.pebs_format >= 6) && + x86_pmu.intel_cap.pebs_baseline && + is_sampling_event(event) && + event->attr.precise_ip) + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index eb14b46423e58..13a78a8a2780d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1294,6 +1294,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) ds->pebs_interrupt_threshold = threshold; } +#define PEBS_DATACFG_CNTRS(x) \ + ((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK) + +#define PEBS_DATACFG_CNTR_BIT(x) \ + (((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT) + +#define PEBS_DATACFG_FIX(x) \ + ((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK) + +#define PEBS_DATACFG_FIX_BIT(x) \ + (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \ + << PEBS_DATACFG_FIX_SHIFT) + static void adaptive_pebs_record_size_update(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1308,10 +1321,58 @@ static void adaptive_pebs_record_size_update(void) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); + if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) { + sz += sizeof(struct pebs_cntr_header); + + /* Metrics base and Metrics Data */ + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + sz += 2 * sizeof(u64); + + if (pebs_data_cfg & PEBS_DATACFG_CNTR) { + sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) + + hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) * + sizeof(u64); + } + } cpuc->pebs_record_size = sz; } +static void __intel_pmu_pebs_update_cfg(struct perf_event *event, + int idx, u64 *pebs_data_cfg) +{ + if (is_metric_event(event)) { + *pebs_data_cfg |= PEBS_DATACFG_METRICS; + return; + } + + *pebs_data_cfg |= PEBS_DATACFG_CNTR; + + if (idx >= INTEL_PMC_IDX_FIXED) + *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED); + else + *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx); +} + + +static void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + u64 pebs_data_cfg = 0; + int i; + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + if (!is_pebs_counter_event_group(event)) + continue; + __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg); + } + + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) + cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW; +} + #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_WEIGHT_TYPE | \ @@ -1914,12 +1975,89 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc) +{ + int shift = 64 - x86_pmu.cntval_bits; + struct hw_perf_event *hwc; + u64 delta, prev_pmc; + + /* + * A recorded counter may not have an assigned event in the + * following cases. The value should be dropped. + * - An event is deleted. There is still an active PEBS event. + * The PEBS record doesn't shrink on pmu::del(). + * If the counter of the deleted event once occurred in a PEBS + * record, PEBS still records the counter until the counter is + * reassigned. + * - An event is stopped for some reason, e.g., throttled. + * During this period, another event is added and takes the + * counter of the stopped event. The stopped event is assigned + * to another new and uninitialized counter, since the + * x86_pmu_start(RELOAD) is not invoked for a stopped event. + * The PEBS__DATA_CFG is updated regardless of the event state. + * The uninitialized counter can be recorded in a PEBS record. + * But the cpuc->events[uninitialized_counter] is always NULL, + * because the event is stopped. The uninitialized value is + * safely dropped. + */ + if (!event) + return; + + hwc = &event->hw; + prev_pmc = local64_read(&hwc->prev_count); + + /* Only update the count when the PMU is disabled */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + local64_set(&hwc->prev_count, pmc); + + delta = (pmc << shift) - (prev_pmc << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct pebs_cntr_header *cntr, + void *next_record) +{ + int bit; + + for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) { + intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record); + next_record += sizeof(u64); + } + + for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) { + /* The slots event will be handled with perf_metric later */ + if ((cntr->metrics == INTEL_CNTR_METRICS) && + (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) { + next_record += sizeof(u64); + continue; + } + intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED], + *(u64 *)next_record); + next_record += sizeof(u64); + } + + /* HW will reload the value right after the overflow. */ + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period); + + if (cntr->metrics == INTEL_CNTR_METRICS) { + static_call(intel_pmu_update_topdown_event) + (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS], + (u64 *)next_record); + next_record += 2 * sizeof(u64); + } +} + #define PEBS_LATENCY_MASK 0xffff /* * With adaptive PEBS the layout depends on what fields are configured. */ - static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -2049,6 +2187,28 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } + if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) { + struct pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct pebs_cntr_header); + /* + * The PEBS_DATA_CFG is a global register, which is the + * superset configuration for all PEBS events. + * For the PEBS record of non-sample-read group, ignore + * the counter snapshot fields. + */ + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + WARN_ONCE(next_record != __pebs + basic->format_size, "PEBS record size %u, expected %llu, config %llx\n", basic->format_size, @@ -2202,13 +2362,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event, } if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); + if ((is_pebs_counter_event_group(event))) { + /* + * The value of each sample has been updated when setup + * the corresponding sample data. + */ + perf_event_update_userpage(event); + } else { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } } else intel_pmu_save_and_restart(event); } @@ -2543,6 +2711,11 @@ void __init intel_ds_init(void) break; case 6: + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + x86_pmu.late_setup = intel_pmu_late_setup; + } + fallthrough; case 5: x86_pmu.pebs_ept = 1; fallthrough; @@ -2567,7 +2740,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); } - pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual); if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 536a112f6353e..a698e6484b3b6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; } +static inline bool is_pebs_counter_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -800,6 +805,7 @@ struct x86_pmu { u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + void (*late_setup)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -1108,6 +1114,7 @@ extern struct x86_pmu x86_pmu __read_mostly; DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1149,6 +1156,12 @@ extern u64 __read_mostly hw_cache_extra_regs u64 x86_perf_event_update(struct perf_event *event); +static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val) +{ + return x86_perf_event_update(event); +} +DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); + static inline unsigned int x86_pmu_config_addr(int index) { return x86_pmu.eventsel + (x86_pmu.addr_offset ? diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 6c977c19f2cd7..1d9e385649b57 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ - /* 0x00080 */ +PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ee5581768fea7..73b104049f8cf 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,6 +141,12 @@ #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +#define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_CNTR_SHIFT 32 +#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) +#define PEBS_DATACFG_FIX_SHIFT 48 +#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) +#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -460,6 +466,15 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +struct pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +#define INTEL_CNTR_METRICS 0x3 + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ -- GitLab From b14ff274e8aa5517ff86c94d682bf26bf8b5dcc8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:47 +0100 Subject: [PATCH 063/934] slab, rcu: move TINY_RCU variant of kvfree_rcu() to SLAB Following the move of TREE_RCU implementation, let's move also the TINY_RCU one for consistency and subsequent refactoring. For simplicity, remove the separate inline __kvfree_call_rcu() as TINY_RCU is not meant for high-performance hardware anyway. Declare kvfree_call_rcu() in rcupdate.h to avoid header dependency issues. Also move the kvfree_rcu_barrier() declaration to slab.h Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 5 +++++ include/linux/rcutiny.h | 36 ------------------------------------ include/linux/rcutree.h | 3 --- include/linux/slab.h | 14 ++++++++++++++ kernel/rcu/tiny.c | 11 ----------- mm/slab_common.c | 19 +++++++++++++++++++ 6 files changed, 38 insertions(+), 50 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd8..3f70d1c814442 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1082,6 +1082,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +/* + * In mm/slab_common.c, no suitable header to include here. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr); + #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fe42315f667fc..f519cd6802286 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -/* - * Add one more declaration of kvfree() here. It is - * not so straight forward to just include - * where it is defined due to getting many compile - * errors caused by that include. - */ -extern void kvfree(const void *addr); - -static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) { - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); - return; - } - - // kvfree_rcu(one_arg) call. - might_sleep(); - synchronize_rcu(); - kvfree(ptr); -} - -static inline void kvfree_rcu_barrier(void) -{ - rcu_barrier(); -} - -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -#else -static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - __kvfree_call_rcu(head, ptr); -} -#endif - void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } -static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d9127817..dbe77b5fe06ec 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); -void kfree_rcu_scheduler_running(void); struct rcu_gp_oldstate { unsigned long rgos_norm; diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf1205..bcc62e5656c35 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1082,6 +1083,19 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); +#ifdef CONFIG_TINY_RCU +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + +static inline void kfree_rcu_scheduler_running(void) { } +#else +void kvfree_rcu_barrier(void); + +void kfree_rcu_scheduler_running(void); +#endif + /** * kmalloc_size_roundup - Report allocation bucket size for the given size * diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f319114650..0ec27093d0e14 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -246,17 +246,6 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) - kasan_record_aux_stack(ptr); - - __kvfree_call_rcu(head, ptr); -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); -#endif - void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/mm/slab_common.c b/mm/slab_common.c index 4030907b6b7d8..81a0ce77b11c2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1284,6 +1284,25 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifdef CONFIG_TINY_RCU + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached -- GitLab From 7f4b19ef3129e1f2e1856b3ee475a02c0be34891 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:48 +0100 Subject: [PATCH 064/934] rcu: remove trace_rcu_kvfree_callback Tree RCU does not handle kvfree_rcu() by queueing individual objects by call_rcu() anymore, thus the tracepoint and associated __is_kvfree_rcu_offset() check is dead code now. Remove it. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/trace/events/rcu.h | 34 ---------------------------------- kernel/rcu/tree.c | 9 ++------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e81431deaa50b..ac3b28b8939b7 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -560,40 +560,6 @@ TRACE_EVENT_RCU(rcu_segcb_stats, ); -/* - * Tracepoint for the registration of a single RCU callback of the special - * kvfree() form. The first argument is the RCU type, the second argument - * is a pointer to the RCU callback, the third argument is the offset - * of the callback within the enclosing RCU-protected data structure, - * the fourth argument is the number of lazy callbacks queued, and the - * fifth argument is the total number of callbacks queued. - */ -TRACE_EVENT_RCU(rcu_kvfree_callback, - - TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), - - TP_ARGS(rcuname, rhp, offset, qlen), - - TP_STRUCT__entry( - __field(const char *, rcuname) - __field(void *, rhp) - __field(unsigned long, offset) - __field(long, qlen) - ), - - TP_fast_assign( - __entry->rcuname = rcuname; - __entry->rhp = rhp; - __entry->offset = offset; - __entry->qlen = qlen; - ), - - TP_printk("%s rhp=%p func=%ld %ld", - __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) -); - /* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. The first argument is the RCU flavor, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed141..5dbc4189037c5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2931,13 +2931,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } -- GitLab From 49d5377b38aa127451cf5dc6d6ea5d9da7f465a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:49 +0100 Subject: [PATCH 065/934] rcu, slab: use a regular callback function for kvfree_rcu RCU has been special-casing callback function pointers that are integers lower than 4096 as offsets of rcu_head for kvfree() instead. The tree RCU implementation no longer does that as the batched kvfree_rcu() is not a simple call_rcu(). The tiny RCU still does, and the plan is also to make tree RCU use call_rcu() for SLUB_TINY configurations. Instead of teaching tree RCU again to special case the offsets, let's remove the special casing completely. Since there's no SLOB anymore, it is possible to create a callback function that can take a pointer to a middle of slab object with unknown offset and determine the object's pointer before freeing it, so implement that as kvfree_rcu_cb(). Large kmalloc and vmalloc allocations are handled simply by aligning down to page size. For that we retain the requirement that the offset is smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely and instead just opencode the condition in the BUILD_BUG_ON() check. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 28 ++++++++++++------------ kernel/rcu/tiny.c | 14 ------------ mm/slab.h | 2 ++ mm/slab_common.c | 5 ++--- mm/slub.c | 46 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3f70d1c814442..23bcf71ffb06e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) -/* - * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kvfree_rcu()? - */ -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. @@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * - * The kfree_rcu() function handles this issue. Rather than encoding a - * function address in the embedded rcu_head structure, kfree_rcu() instead - * encodes the offset of the rcu_head structure within the base structure. - * Because the functions are not allowed in the low-order 4096 bytes of - * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * The kfree_rcu() function handles this issue. In order to have a universal + * callback function handling different offsets of rcu_head, the callback needs + * to determine the starting address of the freed object, which can be a large + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to + * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to @@ -1087,14 +1081,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); +/* + * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the + * comment of kfree_rcu() for details. + */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ - } \ + if (___p) { \ + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0ec27093d0e14..7a34a99d4664f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa65..2f01c7317988c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index 81a0ce77b11c2..6438a38aa5dc2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1290,7 +1290,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) { kasan_record_aux_stack(ptr); - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); + call_rcu(head, kvfree_rcu_cb); return; } @@ -1551,8 +1551,7 @@ kvfree_rcu_list(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kvfree_callback("slab", head, offset); - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3c..e8273f2865693 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include #include #include "slab.h" +#include #include #include #include @@ -4728,6 +4729,51 @@ static void free_large_kmalloc(struct folio *folio, void *object) folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. + */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() -- GitLab From c9f8f1242a4c3e48adc6c3cf6b31c1ffbaa49943 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:50 +0100 Subject: [PATCH 066/934] slab: don't batch kvfree_rcu() with SLUB_TINY kvfree_rcu() is batched for better performance except on TINY_RCU, which is a simple implementation for small UP systems. Similarly SLUB_TINY is an option intended for small systems, whether or not used together with TINY_RCU. In case SLUB_TINY is used with !TINY_RCU, it makes arguably sense to not do the batching and limit the memory footprint. It's also suboptimal to have RCU-specific #ifdefs in slab code. With that, add CONFIG_KVFREE_RCU_BATCHED to determine whether batching kvfree_rcu() implementation is used. It is not set by a user prompt, but enabled by default and disabled in case TINY_RCU or SLUB_TINY are enabled. Use the new config for #ifdef's in slab code and extend their scope to cover all code used by the batched kvfree_rcu(). For example there's no need to perform kvfree_rcu_init() if the batching is disabled. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 +- mm/Kconfig | 4 ++++ mm/slab_common.c | 15 +++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index bcc62e5656c35..7686054dd494c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1083,7 +1083,7 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED static inline void kvfree_rcu_barrier(void) { rcu_barrier(); diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db064172..0b7f4bb5cb801 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -242,6 +242,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT diff --git a/mm/slab_common.c b/mm/slab_common.c index 6438a38aa5dc2..46d0a4cd33b59 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1284,7 +1284,7 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED void kvfree_call_rcu(struct rcu_head *head, void *ptr) { @@ -1301,7 +1301,11 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -#endif +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ /* * This rcu parameter is runtime-read-only. It reflects @@ -1879,8 +1883,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } -#if !defined(CONFIG_TINY_RCU) - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { @@ -2089,8 +2091,6 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); -#endif /* #if !defined(CONFIG_TINY_RCU) */ - static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2180,3 +2180,6 @@ void __init kvfree_rcu_init(void) shrinker_register(kfree_rcu_shrinker); } + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + -- GitLab From f08d0c3a71114bb36d1722506d926bd497182781 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 30 Jan 2025 20:40:26 +0000 Subject: [PATCH 067/934] pidfd: add PIDFD_SELF* sentinels to refer to own thread/process It is useful to be able to utilise the pidfd mechanism to reference the current thread or process (from a userland point of view - thread group leader from the kernel's point of view). Therefore introduce PIDFD_SELF_THREAD to refer to the current thread, and PIDFD_SELF_THREAD_GROUP to refer to the current thread group leader. For convenience and to avoid confusion from userland's perspective we alias these: * PIDFD_SELF is an alias for PIDFD_SELF_THREAD - This is nearly always what the user will want to use, as they would find it surprising if for instance fd's were unshared()'d and they wanted to invoke pidfd_getfd() and that failed. * PIDFD_SELF_PROCESS is an alias for PIDFD_SELF_THREAD_GROUP - Most users have no concept of thread groups or what a thread group leader is, and from userland's perspective and nomenclature this is what userland considers to be a process. We adjust pidfd_get_task() and the pidfd_send_signal() system call with specific handling for this, implementing this functionality for process_madvise(), process_mrelease() (albeit, using it here wouldn't really make sense) and pidfd_send_signal(). Signed-off-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/24315a16a3d01a548dd45c7515f7d51c767e954e.1738268370.git.lorenzo.stoakes@oracle.com Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 24 +++++++++ kernel/pid.c | 24 +++++++-- kernel/signal.c | 105 ++++++++++++++++++++++--------------- 3 files changed, 106 insertions(+), 47 deletions(-) diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 4540f6301b8cd..e0abd0b18841a 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -23,6 +23,30 @@ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gideon knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ + +/* + * ...and for userland we make life simpler - PIDFD_SELF refers to the current + * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. + * + * For nearly all practical uses, a user will want to use PIDFD_SELF. + */ +#define PIDFD_SELF PIDFD_SELF_THREAD +#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP + struct pidfd_info { /* * This mask is similar to the request_mask in statx(2). diff --git a/kernel/pid.c b/kernel/pid.c index 924084713be8b..22f5d2b2e2909 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -564,15 +564,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { - unsigned int f_flags; + unsigned int f_flags = 0; struct pid *pid; struct task_struct *task; + enum pid_type type; - pid = pidfd_get_pid(pidfd, &f_flags); - if (IS_ERR(pid)) - return ERR_CAST(pid); + switch (pidfd) { + case PIDFD_SELF_THREAD: + type = PIDTYPE_PID; + pid = get_task_pid(current, type); + break; + case PIDFD_SELF_THREAD_GROUP: + type = PIDTYPE_TGID; + pid = get_task_pid(current, type); + break; + default: + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + type = PIDTYPE_TGID; + break; + } - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, type); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); diff --git a/kernel/signal.c b/kernel/signal.c index 875e97f6205a2..081f19a24506d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4009,6 +4009,47 @@ static struct pid *pidfd_to_pid(const struct file *file) (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) +static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, + siginfo_t __user *info, unsigned int flags) +{ + kernel_siginfo_t kinfo; + + switch (flags) { + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + + if (info) { + int ret; + + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + return ret; + + if (unlikely(sig != kinfo.si_signo)) + return -EINVAL; + + /* Only allow sending arbitrary signals to yourself. */ + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + return -EPERM; + } else { + prepare_kill_siginfo(sig, &kinfo, type); + } + + if (type == PIDTYPE_PGID) + return kill_pgrp_info(sig, &kinfo, pid); + + return kill_pid_info_type(sig, &kinfo, pid, type); +} + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -4026,9 +4067,7 @@ static struct pid *pidfd_to_pid(const struct file *file) SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { - int ret; struct pid *pid; - kernel_siginfo_t kinfo; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ @@ -4039,57 +4078,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - CLASS(fd, f)(pidfd); - if (fd_empty(f)) - return -EBADF; + switch (pidfd) { + case PIDFD_SELF_THREAD: + pid = get_task_pid(current, PIDTYPE_PID); + type = PIDTYPE_PID; + break; + case PIDFD_SELF_THREAD_GROUP: + pid = get_task_pid(current, PIDTYPE_TGID); + type = PIDTYPE_TGID; + break; + default: { + CLASS(fd, f)(pidfd); + if (fd_empty(f)) + return -EBADF; - /* Is this a pidfd? */ - pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) - return PTR_ERR(pid); + /* Is this a pidfd? */ + pid = pidfd_to_pid(fd_file(f)); + if (IS_ERR(pid)) + return PTR_ERR(pid); - if (!access_pidfd_pidns(pid)) - return -EINVAL; + if (!access_pidfd_pidns(pid)) + return -EINVAL; - switch (flags) { - case 0: /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_THREAD: - type = PIDTYPE_PID; - break; - case PIDFD_SIGNAL_THREAD_GROUP: - type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_PROCESS_GROUP: - type = PIDTYPE_PGID; - break; - } - if (info) { - ret = copy_siginfo_from_user_any(&kinfo, info); - if (unlikely(ret)) - return ret; - - if (unlikely(sig != kinfo.si_signo)) - return -EINVAL; - - /* Only allow sending arbitrary signals to yourself. */ - if ((task_pid(current) != pid || type > PIDTYPE_TGID) && - (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - return -EPERM; - } else { - prepare_kill_siginfo(sig, &kinfo, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); + } } - if (type == PIDTYPE_PGID) - return kill_pgrp_info(sig, &kinfo, pid); - else - return kill_pid_info_type(sig, &kinfo, pid, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); } static int -- GitLab From e943271f7956e439e4c9ef3b7a13f7f00f6c9885 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2025 13:54:56 +0100 Subject: [PATCH 068/934] selftests/pidfd: add new PIDFD_SELF* defines They will be needed in selftests in follow-up patches. Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/pidfd.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 0b96ac4b8ce5f..027ebaf148441 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -50,6 +50,22 @@ #define PIDFD_NONBLOCK O_NONBLOCK #endif +#ifndef PIDFD_SELF_THREAD +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#endif + +#ifndef PIDFD_SELF_THREAD_GROUP +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#endif + +#ifndef PIDFD_SELF +#define PIDFD_SELF PIDFD_SELF_THREAD +#endif + +#ifndef PIDFD_SELF_PROCESS +#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP +#endif + /* * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c * That means, when it wraps around any pid < 300 will be skipped. -- GitLab From 2bbf47f2d396ee32068042a99ecf4da955ce88ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 30 Jan 2025 20:40:30 +0000 Subject: [PATCH 069/934] selftests/pidfd: add tests for PIDFD_SELF_* Add tests to assert that PIDFD_SELF* correctly refers to the current thread and process. We explicitly test pidfd_send_signal(), however We defer testing of mm-specific functionality which uses pidfd, namely process_madvise() and process_mrelease() to mm testing (though note the latter can not be sensibly tested as it would require the testing process to be dying). We also correct the pidfd_open_test.c fields which refer to .request_mask whereas the UAPI header refers to .mask, which otherwise break the import of the UAPI header. Signed-off-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/7ab0e48b26ba53abf7b703df2dd11a2e99b8efb2.1738268370.git.lorenzo.stoakes@oracle.com Reviewed-by: Shakeel Butt Signed-off-by: Christian Brauner --- .../testing/selftests/pidfd/pidfd_open_test.c | 6 +- tools/testing/selftests/pidfd/pidfd_test.c | 76 ++++++++++++++++--- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index ce413a221bacd..9a40ccb1ff6dc 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -31,7 +31,7 @@ #define PIDFD_INFO_CGROUPID (1UL << 0) struct pidfd_info { - __u64 request_mask; + __u64 mask; __u64 cgroupid; __u32 pid; __u32 tgid; @@ -148,7 +148,7 @@ out: int main(int argc, char **argv) { struct pidfd_info info = { - .request_mask = PIDFD_INFO_CGROUPID, + .mask = PIDFD_INFO_CGROUPID, }; int pidfd = -1, ret = 1; pid_t pid; @@ -227,7 +227,7 @@ int main(int argc, char **argv) getegid(), info.sgid); goto on_error; } - if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) { + if ((info.mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) { ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n"); goto on_error; } diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index e9728e86b4f25..fcd85cad9f18f 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -42,12 +42,41 @@ static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *)) #endif } -static int signal_received; +static pthread_t signal_received; static void set_signal_received_on_sigusr1(int sig) { if (sig == SIGUSR1) - signal_received = 1; + signal_received = pthread_self(); +} + +static int send_signal(int pidfd) +{ + int ret = 0; + + if (sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0) < 0) { + ret = -EINVAL; + goto exit; + } + + if (signal_received != pthread_self()) { + ret = -EINVAL; + goto exit; + } + +exit: + signal_received = 0; + return ret; +} + +static void *send_signal_worker(void *arg) +{ + int pidfd = (int)(intptr_t)arg; + int ret; + + /* We forward any errors for the caller to handle. */ + ret = send_signal(pidfd); + return (void *)(intptr_t)ret; } /* @@ -56,8 +85,11 @@ static void set_signal_received_on_sigusr1(int sig) */ static int test_pidfd_send_signal_simple_success(void) { - int pidfd, ret; + int pidfd; const char *test_name = "pidfd_send_signal send SIGUSR1"; + pthread_t thread; + void *thread_res; + int err; if (!have_pidfd_send_signal) { ksft_test_result_skip( @@ -66,25 +98,45 @@ static int test_pidfd_send_signal_simple_success(void) return 0; } + signal(SIGUSR1, set_signal_received_on_sigusr1); + + /* Try sending a signal to ourselves via /proc/self. */ pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); if (pidfd < 0) ksft_exit_fail_msg( "%s test: Failed to open process file descriptor\n", test_name); + err = send_signal(pidfd); + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on sending pidfd signal\n", + test_name, err); + close(pidfd); - signal(SIGUSR1, set_signal_received_on_sigusr1); + /* Now try the same thing only using PIDFD_SELF_THREAD_GROUP. */ + err = send_signal(PIDFD_SELF_THREAD_GROUP); + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on PIDFD_SELF_THREAD_GROUP signal\n", + test_name, err); - ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0); - close(pidfd); - if (ret < 0) - ksft_exit_fail_msg("%s test: Failed to send signal\n", + /* + * Now try the same thing in a thread and assert thread ID is equal to + * worker thread ID. + */ + if (pthread_create(&thread, NULL, send_signal_worker, + (void *)(intptr_t)PIDFD_SELF_THREAD)) + ksft_exit_fail_msg("%s test: Failed to create thread\n", test_name); - - if (signal_received != 1) - ksft_exit_fail_msg("%s test: Failed to receive signal\n", + if (pthread_join(thread, &thread_res)) + ksft_exit_fail_msg("%s test: Failed to join thread\n", test_name); + err = (int)(intptr_t)thread_res; + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on PIDFD_SELF_THREAD signal\n", + test_name, err); - signal_received = 0; ksft_test_result_pass("%s test: Sent signal\n", test_name); return 0; } -- GitLab From 62d648c4429a5edcfcfae03da15d2268d66e1a78 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 30 Jan 2025 20:40:31 +0000 Subject: [PATCH 070/934] selftests/mm: use PIDFD_SELF in guard pages test Now we have PIDFD_SELF available for process_madvise(), make use of it in the guard pages test. This is both more convenient and asserts that PIDFD_SELF works as expected. Signed-off-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/69fbbe088d3424de9983e145228459cb05a8f13d.1738268370.git.lorenzo.stoakes@oracle.com Reviewed-by: Shakeel Butt Signed-off-by: Christian Brauner --- tools/testing/selftests/mm/guard-pages.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c index ece37212a8a2e..525c50d3ec238 100644 --- a/tools/testing/selftests/mm/guard-pages.c +++ b/tools/testing/selftests/mm/guard-pages.c @@ -19,6 +19,8 @@ #include #include +#include "../pidfd/pidfd.h" + /* * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1: * @@ -50,11 +52,6 @@ static void handle_fatal(int c) siglongjmp(signal_jmp_buf, c); } -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(SYS_pidfd_open, pid, flags); -} - static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, size_t n, int advice, unsigned int flags) { @@ -370,14 +367,10 @@ TEST_F(guard_pages, multi_vma) TEST_F(guard_pages, process_madvise) { const unsigned long page_size = self->page_size; - pid_t pid = getpid(); - int pidfd = pidfd_open(pid, 0); char *ptr_region, *ptr1, *ptr2, *ptr3; ssize_t count; struct iovec vec[6]; - ASSERT_NE(pidfd, -1); - /* Reserve region to map over. */ ptr_region = mmap(NULL, 100 * page_size, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); @@ -425,7 +418,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); /* Now guard in one step. */ - count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_INSTALL, 0); /* OK we don't have permission to do this, skip. */ if (count == -1 && errno == EPERM) @@ -446,7 +439,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); /* Now do the same with unguard... */ - count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_REMOVE, 0); /* ...and everything should now succeed. */ @@ -463,7 +456,6 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(ptr1, 10 * page_size), 0); ASSERT_EQ(munmap(ptr2, 5 * page_size), 0); ASSERT_EQ(munmap(ptr3, 20 * page_size), 0); - close(pidfd); } /* Assert that unmapping ranges does not leave guard markers behind. */ -- GitLab From ad6b5b73ff565e88aca7a7d1286788d80c97ba71 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:52 -0800 Subject: [PATCH 071/934] rcu: fix header guard for rcu_all_qs() rcu_all_qs() is defined for !CONFIG_PREEMPT_RCU but the declaration is conditioned on CONFIG_PREEMPTION. With CONFIG_PREEMPT_LAZY, CONFIG_PREEMPTION=y does not imply CONFIG_PREEMPT_RCU=y. Decouple the two. Cc: Paul E. McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/rcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d9127817..aad586f15ed0c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -103,7 +103,7 @@ extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPTION +#ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif -- GitLab From 4dca1af414fb1f27c3350a65820cb0b91178e8fe Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:53 -0800 Subject: [PATCH 072/934] rcu: rename PREEMPT_AUTO to PREEMPT_LAZY Replace mentions of PREEMPT_AUTO with PREEMPT_LAZY. Also, since PREMPT_LAZY implies PREEMPTION, we can reduce the TASKS_RCU selection criteria from this: NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) to this: NEED_TASKS_RCU && PREEMPTION CC: Paul E. McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutiny.h | 2 +- kernel/rcu/Kconfig | 2 +- kernel/rcu/srcutiny.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274d..31b59b4be2a74 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -64,7 +64,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); preempt_enable(); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185db..e2206f3a070ca 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -91,7 +91,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff7..f688bdad293ed 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -98,7 +98,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +120,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +159,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +172,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +199,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +261,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); -- GitLab From 2c00e1199c060880a215b0d2b495b7738e8c69d7 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:54 -0800 Subject: [PATCH 073/934] sched: update __cond_resched comment about RCU quiescent states Update comment in __cond_resched() clarifying how urgently needed quiescent state are provided. Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 165c90ba64ea9..d328707626e30 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7289,7 +7289,7 @@ int __sched __cond_resched(void) return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -7298,6 +7298,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); -- GitLab From fcf0e25ad4c8d14d2faab4d9a17040f31efce205 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:55 -0800 Subject: [PATCH 074/934] rcu: handle unstable rdp in rcu_read_unlock_strict() rcu_read_unlock_strict() can be called with preemption enabled which can make for an unstable rdp and a racy norm value. Fix this by dropping the preempt-count in __rcu_read_unlock() after the call to rcu_read_unlock_strict(), adjusting the preempt-count check appropriately. Suggested-by: Frederic Weisbecker Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd8..257e9ae344147 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { - preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); + preempt_enable(); } static inline int rcu_preempt_depth(void) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3600152b858e8..9573408a98004 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); -- GitLab From 83b28cfe796464ebbde1cf7916c126da6d572685 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:56 -0800 Subject: [PATCH 075/934] rcu: handle quiescent states for PREEMPT_RCU=n, PREEMPT_COUNT=y With PREEMPT_RCU=n, cond_resched() provides urgently needed quiescent states for read-side critical sections via rcu_all_qs(). One reason why this was needed: lacking preempt-count, the tick handler has no way of knowing whether it is executing in a read-side critical section or not. With (PREEMPT_LAZY=y, PREEMPT_DYNAMIC=n), we get (PREEMPT_COUNT=y, PREEMPT_RCU=n). In this configuration cond_resched() is a stub and does not provide quiescent states via rcu_all_qs(). (PREEMPT_RCU=y provides this information via rcu_read_unlock() and its nesting counter.) So, use the availability of preempt_count() to report quiescent states in rcu_flavor_sched_clock_irq(). Suggested-by: Paul E. McKenney Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree_plugin.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9573408a98004..3c0bbbbb686fe 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -984,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs -- GitLab From da2ac5623716cc3f32eeeafef9b7c08c73c2b20a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Jun 2024 08:43:02 -0700 Subject: [PATCH 076/934] srcu: Make Tiny SRCU able to operate in preemptible kernels Given that SRCU allows its read-side critical sections are not just preemptible, but also allow general blocking, there is not much reason to restrict Tiny SRCU to non-preemptible kernels. This commit therefore removes Tiny SRCU dependencies on non-preemptibility, primarily surrounding its interaction with rcutorture and early boot. Signed-off-by: Paul E. McKenney Cc: Ankur Arora Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 9 ++++++--- kernel/rcu/srcutiny.c | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d59..2909662c805f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -611,8 +611,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +622,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +633,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff7..2a94f0e656065 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. -- GitLab From b459874faa7b6096982552498cab735075e53013 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2024 12:39:58 -0800 Subject: [PATCH 077/934] srcu: Define SRCU_READ_FLAVOR_ALL in terms of symbols This commit defines SRCU_READ_FLAVOR_ALL in terms of the SRCU_READ_FLAVOR_* definitions instead of a hexadecimal constant. Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d7ba46e74f58e..f6f779b9d9ff2 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,7 +47,8 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). -#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_LITE) // All of the above. #ifdef CONFIG_TINY_SRCU #include -- GitLab From 5f9e1bc50a046578ddbfb05cda0f053d856bef98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Dec 2024 16:16:32 -0800 Subject: [PATCH 078/934] srcu: Use ->srcu_gp_seq for rcutorture reader batch This commit stops using ->srcu_idx for rcutorture's reader-batch consistency checking, using ->srcu_gp_seq instead. This is a first step towards a faster srcu_read_{,un}lock_lite() that avoids the array accesses that use ->srcu_idx. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 2 ++ kernel/rcu/srcutree.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9a..1d2de50fb5d60 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -791,6 +791,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -834,6 +835,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0d..e69ce9d59abf6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1675,7 +1675,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); -- GitLab From 56eb8be144c2bdb3a96a0d4365777fc64c65c5d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 11:13:51 -0800 Subject: [PATCH 079/934] srcu: Pull ->srcu_{un,}lock_count into a new srcu_ctr structure This commit prepares for array-index-free srcu_read_lock*() by moving the ->srcu_{un,}lock_count fields into a new srcu_ctr structure. This will permit ->srcu_index to be replaced by a per-CPU pointer to this structure. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 13 +++-- kernel/rcu/srcutree.c | 115 +++++++++++++++++++-------------------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index b17814c9d1c76..c794d599db5c1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -17,14 +17,19 @@ struct srcu_node; struct srcu_struct; +/* One element of the srcu_data srcu_ctrs array. */ +struct srcu_ctr { + atomic_long_t srcu_locks; /* Locks per CPU. */ + atomic_long_t srcu_unlocks; /* Unlocks per CPU. */ +}; + /* * Per-CPU structure feeding into leaf srcu_node, similar in function * to rcu_node. */ struct srcu_data { /* Read-side state. */ - atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ - atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ + struct srcu_ctr srcu_ctrs[2]; /* Locks and unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Values: SRCU_READ_FLAVOR_.* */ @@ -221,7 +226,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ return idx; } @@ -240,7 +245,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e69ce9d59abf6..d7ee2f345e192 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -429,10 +428,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,7 +442,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } @@ -455,8 +454,8 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +466,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -510,9 +509,9 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there + * ->srcu_idx and incrementing ->srcu_ctrs[idx].srcu_locks. And there * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks * in a system whose address space was fully populated with memory. * Call this quantity Nt. @@ -521,36 +520,36 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * code for a long time. That now-preempted updater has already * flipped ->srcu_idx (possibly during the preceding grace period), * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. + * period), and summed up the ->srcu_ctrs[idx].srcu_unlocks counters. * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] + * increment the old ->srcu_idx value's ->srcu_ctrs[idx].srcu_locks * counter, in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_idx and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + * as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_idx, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_idx in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_idx until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_idx twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +580,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -746,7 +745,7 @@ int __srcu_read_lock(struct srcu_struct *ssp) int idx; idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -760,7 +759,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -777,7 +776,7 @@ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -793,7 +792,7 @@ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1123,17 +1122,17 @@ static void srcu_flip(struct srcu_struct *ssp) /* * Because the flip of ->srcu_idx is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_idx from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_idx. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of @@ -1914,8 +1913,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1922,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; -- GitLab From 795e7efec6ea7e9d597c3fced9f5307fae467cb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 16:08:54 -0800 Subject: [PATCH 080/934] srcu: Make SRCU readers use ->srcu_ctrs for counter selection This commit causes SRCU readers to use ->srcu_ctrs for counter selection instead of ->srcu_idx. This takes another step towards array-indexing-free SRCU readers. [ paulmck: Apply kernel test robot feedback. ] Co-developed-by: Z qiang Signed-off-by: Z qiang Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 +++++---- kernel/rcu/srcutree.c | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index c794d599db5c1..1b01ced61a45b 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -101,6 +101,7 @@ struct srcu_usage { */ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ + struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ @@ -167,6 +168,7 @@ struct srcu_struct { #define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ + .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } @@ -222,13 +224,12 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); */ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return idx; + return scp - &ssp->sda->srcu_ctrs[0]; } /* diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d7ee2f345e192..7efde1a2344e1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -253,8 +253,10 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -742,12 +744,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return scp - &ssp->sda->srcu_ctrs[0]; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -772,13 +773,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_locks); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return scpp - &ssp->sda->srcu_ctrs[0]; } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -1152,6 +1152,8 @@ static void srcu_flip(struct srcu_struct *ssp) smp_mb(); /* E */ /* Pairs with B and C. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -2000,6 +2002,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } -- GitLab From 821ca6fa15d864951da89233da8fd89e932d5215 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 16:32:12 -0800 Subject: [PATCH 081/934] srcu: Make Tree SRCU updates independent of ->srcu_idx This commit makes Tree SRCU updates independent of ->srcu_idx, then drop ->srcu_idx. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 1 - kernel/rcu/srcutree.c | 68 ++++++++++++++++++++-------------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1b01ced61a45b..6b7eba59f3849 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -100,7 +100,6 @@ struct srcu_usage { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 7efde1a2344e1..247bdf42fb541 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -246,7 +246,6 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); @@ -510,38 +509,39 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's + * the current ->srcu_ctrp but not yet have incremented its CPU's * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_ctrs[idx].srcu_locks. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_ctrs[idx].srcu_unlocks counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_ctrs[idx].srcu_locks - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that + * the old value of ->srcu_ctrp and is just about to use that * value to index its increment of ->srcu_ctrs[idx].srcu_locks. * But as soon as it leaves that SRCU read-side critical section, * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must - * follow the updater's above read from that same value. Thus, - * as soon the reading task does an smp_mb() and a later fetch from - * ->srcu_idx, that task will be guaranteed to get the new index. + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks * in __srcu_read_unlock() is after the smp_mb(), and the fetch - * from ->srcu_idx in __srcu_read_lock() is before the smp_mb(). - * Thus, that task might not see the new value of ->srcu_idx until + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until * the -second- __srcu_read_lock(), which in turn means that this * task might well increment ->srcu_ctrs[idx].srcu_locks for the - * old value of ->srcu_idx twice, not just once. + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any @@ -1095,7 +1095,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { @@ -1113,14 +1113,14 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums * matched and because that summing uses atomic_long_read(), @@ -1128,15 +1128,15 @@ static void srcu_flip(struct srcu_struct *ssp) * summing and the WRITE_ONCE() in this call to srcu_flip(). * This ordering ensures that if this updater saw a given reader's * increment from __srcu_read_lock(), that reader was using a value - * of ->srcu_idx from before the previous call to srcu_flip(), + * of ->srcu_ctrp from before the previous call to srcu_flip(), * which should be quite rare. This ordering thus helps forward * progress because the grace period could otherwise be delayed * by additional calls to __srcu_read_lock() using that old (soon - * to be new) value of ->srcu_idx. + * to be new) value of ->srcu_ctrp. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1151,7 +1151,6 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. WRITE_ONCE(ssp->srcu_ctrp, &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); @@ -1466,8 +1465,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1693,7 +1693,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1721,7 +1721,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1739,7 +1739,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1897,7 +1897,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", -- GitLab From d31e31365b5b6c0cdfc74d71be87234ced564395 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jan 2025 17:04:49 -0800 Subject: [PATCH 082/934] srcu: Force synchronization for srcu_get_delay() Currently, srcu_get_delay() can be called concurrently, for example, by a CPU that is the first to request a new grace period and the CPU processing the current grace period. Although concurrent access is harmless, it unnecessarily expands the state space. Additionally, all calls to srcu_get_delay() are from slow paths. This commit therefore protects all calls to srcu_get_delay() with ssp->srcu_sup->lock, which is already held on the invocation from the srcu_funnel_gp_start() function. While in the area, this commit also adds a lockdep_assert_held() to srcu_get_delay() itself. Reported-by: syzbot+16a19b06125a2963eaee@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/srcutree.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 247bdf42fb541..121dd290cae14 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -648,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -675,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -1101,7 +1106,9 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1850,7 +1857,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { -- GitLab From 780818a68132d45d074353fc66f5f116074c3c14 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jan 2025 17:07:34 -0800 Subject: [PATCH 083/934] srcu: Rename srcu_check_read_flavor_lite() to srcu_check_read_flavor_force() This commit renames the srcu_check_read_flavor_lite() function to srcu_check_read_flavor_force() and adds a read_flavor argument in order to support an srcu_read_lock_fast() variant that is to avoid array indexing in both the lock and unlock primitives. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 2 +- include/linux/srcutiny.h | 2 +- include/linux/srcutree.h | 10 ++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f6f779b9d9ff2..ca00b9af7c237 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -279,7 +279,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor_lite(ssp); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE); retval = __srcu_read_lock_lite(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274d..b347bde1aac29 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -82,7 +82,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp) } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_lite(ssp) do { } while (0) +#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 6b7eba59f3849..e29cc57eac81d 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -251,16 +251,18 @@ static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. -static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is +// needed only for flavors that require grace-period smp_mb() calls to be +// promoted to synchronize_rcu(). +static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) { struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) return; // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. - __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + __srcu_check_read_flavor(ssp, read_flavor); } // Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. -- GitLab From 4d86b1e7e1e98eb1f0e3c5a4635a5c37cbd22919 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 06:48:15 -0800 Subject: [PATCH 084/934] srcu: Add SRCU_READ_FLAVOR_SLOWGP to flag need for synchronize_rcu() This commit switches from a direct test of SRCU_READ_FLAVOR_LITE to a new SRCU_READ_FLAVOR_SLOWGP macro to check for substituting synchronize_rcu() for smp_mb() in SRCU grace periods. Right now, SRCU_READ_FLAVOR_SLOWGP is exactly SRCU_READ_FLAVOR_LITE, but the addition of the _fast() flavor of SRCU will change that. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 +++ kernel/rcu/srcutree.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ca00b9af7c237..505f5bdce4446 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -49,6 +49,9 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_LITE) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE + // Flavors requiring synchronize_rcu() + // instead of smp_mb(). #ifdef CONFIG_TINY_SRCU #include diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 121dd290cae14..8b5c50bc98e5d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -449,7 +449,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -1205,7 +1205,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); -- GitLab From f4bde41dd19db5e2ea9e0b4a19ac2573f7244d03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 14:31:27 -0800 Subject: [PATCH 085/934] srcu: Pull pointer-to-integer conversion into __srcu_ptr_to_ctr() This commit abstracts the srcu_read_lock*() pointer-to-integer conversion into a new __srcu_ptr_to_ctr(). This will be used in rcutorture for testing an srcu_read_lock_fast() that returns a pointer rather than an integer. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 ++++++++- kernel/rcu/srcutree.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index e29cc57eac81d..f41bb3a55a048 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -211,6 +211,13 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that +// element's index. +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return scpp - &ssp->sda->srcu_ctrs[0]; +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -228,7 +235,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return scp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scp); } /* diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8b5c50bc98e5d..a91651866485d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -753,7 +753,7 @@ int __srcu_read_lock(struct srcu_struct *ssp) this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return scp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -783,7 +783,7 @@ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return scpp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); -- GitLab From 4937096b579a36cfa5764a229d1a89542e10cf5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 15:27:24 -0800 Subject: [PATCH 086/934] srcu: Pull integer-to-pointer conversion into __srcu_ctr_to_ptr() This commit abstracts the srcu_read_unlock*() integer-to-pointer conversion into a new __srcu_ctr_to_ptr(). This will be used in rcutorture for testing an srcu_read_unlock_fast() that avoids array-indexing overhead by taking a pointer rather than an integer. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 ++++++++- kernel/rcu/srcutree.c | 6 ++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f41bb3a55a048..55fa400624bb4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -218,6 +218,13 @@ static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __ return scpp - &ssp->sda->srcu_ctrs[0]; } +// Converts an integer to a per-CPU pointer to the corresponding +// ->srcu_ctrs[] array element. +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return &ssp->sda->srcu_ctrs[idx]; +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -252,7 +259,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); /* Z */ + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a91651866485d..7a8ace83c98df 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -765,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -794,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_ctrs[idx].srcu_unlocks); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); -- GitLab From 443971156cebfc54a5f5c37a5702c13a2bb06755 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Jan 2025 09:39:35 -0800 Subject: [PATCH 087/934] srcu: Move SRCU Tree/Tiny definitions from srcu.h There are a couple of definitions under "#ifdef CONFIG_TINY_SRCU" in include/linux/srcu.h. There is no point in them being there, so this commit moves them to include/linux/srcutiny.h and include/linux/srcutree.c, thus eliminating that #ifdef. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 10 +--------- include/linux/srcutiny.h | 3 +++ include/linux/srcutree.h | 1 + 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 505f5bdce4446..2bd0e24e9b554 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -52,6 +52,7 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE // Flavors requiring synchronize_rcu() // instead of smp_mb(). +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); #ifdef CONFIG_TINY_SRCU #include @@ -64,15 +65,6 @@ int init_srcu_struct(struct srcu_struct *ssp); void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); -int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); -#ifdef CONFIG_TINY_SRCU -#define __srcu_read_lock_lite __srcu_read_lock -#define __srcu_read_unlock_lite __srcu_read_unlock -#else // #ifdef CONFIG_TINY_SRCU -int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); -#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index b347bde1aac29..a6194f7a7e34b 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -71,6 +71,9 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) return idx; } +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock + static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { synchronize_srcu(ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 55fa400624bb4..ef3065c0cadcd 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -207,6 +207,7 @@ struct srcu_struct { #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); -- GitLab From c4020620528e4e22a051900654a70dcff0ab218d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 13:19:42 -0800 Subject: [PATCH 088/934] srcu: Add SRCU-fast readers This commit adds srcu_read_{,un}lock_fast(), which is similar to srcu_read_{,un}lock_lite(), but avoids the array-indexing and pointer-following overhead. On a microbenchmark featuring tight loops around empty readers, this results in about a 20% speedup compared to RCU Tasks Trace on my x86 laptop. Please note that SRCU-fast has drawbacks compared to RCU Tasks Trace, including: o Lack of CPU stall warnings. o SRCU-fast readers permitted only where rcu_is_watching(). o A pointer-sized return value from srcu_read_lock_fast() must be passed to the corresponding srcu_read_unlock_fast(). o In the absence of readers, a synchronize_srcu() having _fast() readers will incur the latency of at least two normal RCU grace periods. o RCU Tasks Trace priority boosting could be easily added. Boosting SRCU readers is more difficult. SRCU-fast also has a drawback compared to SRCU-lite, namely that the return value from srcu_read_lock_fast()-fast is a 64-bit pointer and that from srcu_read_lock_lite() is only a 32-bit int. [ paulmck: Apply feedback from Akira Yokosawa. ] Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 47 ++++++++++++++++++++++++++++++++++++++-- include/linux/srcutiny.h | 22 +++++++++++++++++++ include/linux/srcutree.h | 38 ++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 2 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 2bd0e24e9b554..63bddc3014238 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,9 +47,10 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). +#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ - SRCU_READ_FLAVOR_LITE) // All of the above. -#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE + SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // Flavors requiring synchronize_rcu() // instead of smp_mb(). void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); @@ -253,6 +254,33 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_read_lock_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * + * If srcu_read_lock_fast() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _fast + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_fast() can be invoked only from those contexts + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + retval = __srcu_read_lock_fast(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /** * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -356,6 +384,21 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_fast - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_fast(ssp, scp); +} + /** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index a6194f7a7e34b..e271f9f96bfce 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -71,6 +71,28 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) return idx; } +struct srcu_ctr; + +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return (int)(intptr_t)(struct srcu_ctr __force __kernel *)scpp; +} + +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return (struct srcu_ctr __percpu *)(intptr_t)idx; +} + +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp)); +} + +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); +} + #define __srcu_read_lock_lite __srcu_read_lock #define __srcu_read_unlock_lite __srcu_read_unlock diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ef3065c0cadcd..bdc467efce3a2 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -226,6 +226,44 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss return &ssp->sda->srcu_ctrs[idx]; } +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns a pointer that must be passed to the matching + * srcu_read_unlock_fast(). + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); + this_cpu_inc(scp->srcu_locks.counter); /* Y */ + barrier(); /* Avoid leaking the critical section. */ + return scp; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_fast(), but it must be within the same task. + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + barrier(); /* Avoid leaking the critical section. */ + this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching -- GitLab From 176d19eecb4821e541e68fdc57b2d7907b52cfd1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 13:24:44 -0800 Subject: [PATCH 089/934] rcutorture: Add ability to test srcu_read_{,un}lock_fast() This commit permits rcutorture to test srcu_read_{,un}lock_fast(), which is specified by the rcutorture.reader_flavor=0x8 kernel boot parameter. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1d2de50fb5d60..1bd3eaa0b8e7a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -677,6 +677,7 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { @@ -694,6 +695,12 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -719,6 +726,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) -- GitLab From 4c3fca0f5990af9a3c15a2944854ce08c5937630 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 16:32:31 -0800 Subject: [PATCH 090/934] refscale: Add srcu_read_lock_fast() support using "srcu-fast" This commit creates a new srcu-fast option for the refscale.scale_type module parameter that selects srcu_read_lock_fast() and srcu_read_unlock_fast(). Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- kernel/rcu/refscale.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1b47376acdc40..f11a7c2af778c 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -1163,7 +1193,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, -- GitLab From c143bac019152c4aa34b5fbd000df28ca818f2f1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Jan 2025 15:28:06 -0800 Subject: [PATCH 091/934] rcutorture: Make scenario SRCU-P use srcu_read_lock_fast() This commit causes the rcutorture SRCU-P scenario use the srcu_read_lock_fast() and srcu_read_unlock_fast() functions. This will cause these two functions to be regularly tested by several developers (myself included), for example, those who use torture.sh as an RCU acceptance test. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot index 2db39f298d182..fb61703690cb3 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot @@ -2,3 +2,4 @@ rcutorture.torture_type=srcud rcupdate.rcu_self_test=1 rcutorture.fwd_progress=3 srcutree.big_cpu_lim=5 +rcutorture.reader_flavor=0x8 -- GitLab From dfe442c943b7ab86f3eb2228e32179bda1402e96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Jan 2025 21:05:37 -0800 Subject: [PATCH 092/934] srcu: Fix srcu_read_unlock_{lite,nmisafe}() kernel-doc The srcu_read_unlock_lite() and srcu_read_unlock_nmisafe() both say that their idx parameters must come from srcu_read_lock(). This would be bad, because a given srcu_struct structure may be used only with one flavor of SRCU reader. This commit therefore updates the srcu_read_unlock_lite() kernel-doc header to say that its idx parameter must be obtained from srcu_read_lock_lite() and the srcu_read_unlock_nmisafe() kernel-doc header to say that its idx parameter must be obtained from srcu_read_lock_nmisafe(). Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 63bddc3014238..a0df80baaccf3 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -402,7 +402,7 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct /** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_lite(). * * Exit a light-weight SRCU read-side critical section. */ @@ -418,7 +418,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_nmisafe(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ -- GitLab From 729fb74889d94e6051d6ef2b21c769fe7e54f176 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Jan 2025 09:14:18 -0800 Subject: [PATCH 093/934] srcu: Document that srcu_{read_lock,down_read}() can share srcu_struct This commit adds a sentence to the srcu_down_read() function's kernel-doc header noting that it is permissible to use srcu_down_read() and srcu_read_lock() on the same srcu_struct, even concurrently. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a0df80baaccf3..317eab82a5f0d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -359,7 +359,8 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in - * which calls to down_read() may be nested. + * which calls to down_read() may be nested. The same srcu_struct may be + * used concurrently by srcu_down_read() and srcu_read_lock(). */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { -- GitLab From f8b8df19b2dcb528e97395fdb74ca18b61db5207 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Jan 2025 16:06:58 -0800 Subject: [PATCH 094/934] srcu: Add srcu_down_read_fast() and srcu_up_read_fast() A pair of matching srcu_read_lock_fast() and srcu_read_unlock_fast() invocations must take place within the same context, for example, within the same task. Otherwise, lockdep complains, as is the right thing to do for most use cases. However, there are use cases involving tracing (for example, uretprobes) in which an SRCU reader needs to begin in one task and end in a timer handler, which might interrupt some other task. This commit therefore supplies the semaphore-like srcu_down_read_fast() and srcu_up_read_fast() functions, which act like srcu_read_lock_fast() and srcu_read_unlock_fast(), but permitting srcu_up_read_fast() to be invoked in a different context than was the matching srcu_down_read_fast(). Neither srcu_down_read_fast() nor srcu_up_read_fast() may be invoked from an NMI handler. Reported-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 317eab82a5f0d..900b0d5c05f54 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -281,6 +281,24 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * return retval; } +/** + * srcu_down_read_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter a semaphore-like SRCU read-side critical section, but for + * a light-weight smp_mb()-free reader. See srcu_read_lock_fast() and + * srcu_down_read() for more information. + * + * The same srcu_struct may be used concurrently by srcu_down_read_fast() + * and srcu_read_lock_fast(). + */ +static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + return __srcu_read_lock_fast(ssp); +} + /** * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -400,6 +418,22 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct __srcu_read_unlock_fast(ssp, scp); } +/** + * srcu_up_read_fast - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit an SRCU read-side critical section, but not necessarily from + * the same context as the maching srcu_down_read_fast(). + */ +static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + __srcu_read_unlock_fast(ssp, scp); +} + /** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. -- GitLab From 3cec27453db49a176e688b7721c3cd26be5ef835 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2025 18:32:49 -0800 Subject: [PATCH 095/934] srcu: Make SRCU-fast also be NMI-safe BPF uses rcu_read_lock_trace() in NMI context, so srcu_read_lock_fast() must be NMI-safe if it is to have any chance of addressing RCU Tasks Trace use cases. This commit therefore causes srcu_read_lock_fast() and srcu_read_unlock_fast() to use atomic_long_inc() instead of this_cpu_inc() on architectures that support NMIs but do not have NMI-safe implementations of this_cpu_inc(). Note that both x86 and arm64 have NMI-safe implementations of this_cpu_inc(), and thus do not pay the performance penalty inherent in atomic_inc_long(). It is tempting to use this trick to fold srcu_read_lock_nmisafe() into srcu_read_lock(), but this would need careful thought, review, and performance analysis. Though those smp_mb() calls might well make performance a non-issue. Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index bdc467efce3a2..8bed7e6cc4c1f 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -231,17 +231,24 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss * srcu_struct. Returns a pointer that must be passed to the matching * srcu_read_unlock_fast(). * - * Note that this_cpu_inc() is an RCU read-side critical section either - * because it disables interrupts, because it is a single instruction, - * or because it is a read-modify-write atomic operation, depending on - * the whims of the architecture. + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_lock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). */ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) { struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); - this_cpu_inc(scp->srcu_locks.counter); /* Y */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_locks.counter); /* Y */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */ barrier(); /* Avoid leaking the critical section. */ return scp; } @@ -252,15 +259,22 @@ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct * different CPU than that which was incremented by the corresponding * srcu_read_lock_fast(), but it must be within the same task. * - * Note that this_cpu_inc() is an RCU read-side critical section either - * because it disables interrupts, because it is a single instruction, - * or because it is a read-modify-write atomic operation, depending on - * the whims of the architecture. + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_unlock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). */ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } -- GitLab From 623b52802bb0b33bee28716002249aae5d89c5ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2024 14:57:51 -0800 Subject: [PATCH 096/934] torture: Add get_torture_init_jiffies() for test-start time This commit adds a get_torture_init_jiffies() function that returns the value of the jiffies counter at the start of the test, that is, at the point where torture_init_begin() was invoked. This will be used to enable torture-test holdoffs for tests implemented using per-CPU kthreads, which are created and deleted by CPU-hotplug operations, and thus (unlike normal kthreads) don't automatically know when the test started. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/torture.h | 1 + kernel/torture.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/torture.h b/include/linux/torture.h index 0134e7221cae6..1b59056c3b182 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -104,6 +104,7 @@ int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. */ bool torture_init_begin(char *ttype, int v); void torture_init_end(void); +unsigned long get_torture_init_jiffies(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); bool torture_must_stop(void); diff --git a/kernel/torture.c b/kernel/torture.c index dede150aef011..3a0a8cc604010 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static unsigned long torture_init_jiffies; + static void torture_print_module_parms(void) { @@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads. torture_print_module_parms(); return true; } @@ -836,6 +839,15 @@ void torture_init_end(void) } EXPORT_SYMBOL_GPL(torture_init_end); +/* + * Get the torture_init_begin()-time value of the jiffies counter. + */ +unsigned long get_torture_init_jiffies(void) +{ + return READ_ONCE(torture_init_jiffies); +} +EXPORT_SYMBOL_GPL(get_torture_init_jiffies); + /* * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from -- GitLab From b8726c5aa6e8e0f4e8abc27f0c81db9f294958dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2024 15:00:16 -0800 Subject: [PATCH 097/934] rcutorture: Add a test_boost_holdoff module parameter This commit adds a test_boost_holdoff module parameter that tells the RCU priority-boosting tests to wait for the specified number of seconds past the start of the rcutorture test. This can be useful when rcutorture is built into the kernel (as opposed to being modprobed), especially on large systems where early start of RCU priority boosting can delay the boot sequence, which adds a full CPU's worth of load onto the system. This can in turn result in pointless stall warnings. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- .../admin-guide/kernel-parameters.txt | 5 +++++ kernel/rcu/rcutorture.c | 19 ++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec85..ed1a0df03b187 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5758,6 +5758,11 @@ rcutorture.test_boost_duration= [KNL] Duration (s) of each individual boost test. + rcutorture.test_boost_holdoff= [KNL] + Holdoff time (s) from start of test to the start + of RCU priority-boost testing. Defaults to zero, + that is, no holdoff. + rcutorture.test_boost_interval= [KNL] Interval (s) between each boost test. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9a..fbf1d7fcf61d0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -1148,8 +1149,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1225,6 +1237,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -2512,7 +2525,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2526,7 +2539,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, -- GitLab From 84ae91018af56184afabb1bc08b5c117a0634e5e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Nov 2024 13:55:32 -0800 Subject: [PATCH 098/934] rcutorture: Include grace-period sequence numbers in failure/close-call This commit includes the grace-period sequence numbers at the beginning and end of each segment in the "Failure/close-call rcutorture reader segments" list. These are in hexadecimal, and only the bottom byte. Currently, only RCU is supported, with its three sequence numbers (normal, expedited, and polled). Note that if all the grace-period sequence numbers remain the same across a given reader segment, only one copy of the number will be printed. Of course, if there is a change, both sets of values will be printed. Because the overhead of collecting this information can suppress heisenbugs, this information is collected and printed only in kernels built with CONFIG_RCU_TORTURE_TEST_LOG_GP=y. [ paulmck: Apply Nathan Chancellor feedback for IS_ENABLED(). ] [ paulmck: Apply feedback from kernel test robot. ] Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig.debug | 14 ++++++++++++++ kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 34 ++++++++++++++++++++++++++++++++++ kernel/rcu/tiny.c | 14 ++++++++++++++ kernel/rcu/tree.c | 20 ++++++++++++++++++++ 5 files changed, 84 insertions(+) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 6af90510a1ca7..25a9dc2be0dcd 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU Say Y here if you want CPU IDs logged. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. + config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d59..a6098997a14ba 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif +unsigned long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long seqs, char *cp); #ifdef CONFIG_TINY_SRCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fbf1d7fcf61d0..2113583cae347 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -273,6 +273,8 @@ struct rt_read_seg { bool rt_preempted; int rt_cpu; int rt_end_cpu; + unsigned long rt_gp_seq; + unsigned long rt_gp_seq_end; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -407,6 +409,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); + unsigned long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long seqs, char *cp); long cbflood_max; int irq_capable; int can_boost; @@ -611,6 +615,8 @@ static struct rcu_torture_ops rcu_ops = { .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) ? has_rcu_reader_blocked : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -656,6 +662,8 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" @@ -1978,6 +1986,12 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); } } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -3566,6 +3580,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3661,6 +3676,25 @@ rcu_torture_cleanup(void) else pr_cont(" ..."); } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[16+1]; + char buf2[16+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, buf1); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, buf2); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f319114650..f9c4a24dc59c2 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -257,6 +257,20 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) EXPORT_SYMBOL_GPL(kvfree_call_rcu); #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long rcutorture_gather_gp_seqs(void) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xff; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +{ + snprintf(cp, 8, "g%02lx", seqs & 0xff); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed141..e40c4b5c3267a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xff) << 16) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xff) << 8) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xff); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. */ +void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +{ + unsigned int egp = (seqs >> 8) & 0xff; + unsigned int ggp = (seqs >> 16) & 0xff; + unsigned int pgp = seqs & 0xff; + + snprintf(cp, 16, "g%02x:e%02x:p%02x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on -- GitLab From 2db7ab8c108669d0b7d87c617edf0a8e132bd1c7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Dec 2024 19:47:44 -0800 Subject: [PATCH 099/934] rcutorture: Expand failure/close-call grace-period output With only eight bits per grace-period sequence number, wrap can happen in 64 grace periods. This commit therefore increases this to sixteen bits for normal grace-period sequence numbers and the combined short-form polling sequence numbers, thus deferring wrap for at least 16,384 grace periods. Because expedited grace periods go faster, expand these to 24 bits, deferring wrap for at least 4,194,304 expedited grace periods. These longer wrap times makes it easier to correlate these numbers to trace-event output. Note that the low-order two bits are reserved for intra-grace-period state, hence the above wrap numbers being a factor of four smaller than you might expect. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 4 ++-- kernel/rcu/rcutorture.c | 12 ++++++------ kernel/rcu/tiny.c | 8 ++++---- kernel/rcu/tree.c | 18 +++++++++--------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a6098997a14ba..705fcbe6f5001 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -590,8 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif -unsigned long rcutorture_gather_gp_seqs(void); -void rcutorture_format_gp_seqs(unsigned long seqs, char *cp); +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp); #ifdef CONFIG_TINY_SRCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2113583cae347..fb1b80498ae01 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -273,8 +273,8 @@ struct rt_read_seg { bool rt_preempted; int rt_cpu; int rt_end_cpu; - unsigned long rt_gp_seq; - unsigned long rt_gp_seq_end; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -409,8 +409,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); - unsigned long (*gather_gp_seqs)(void); - void (*format_gp_seqs)(unsigned long seqs, char *cp); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp); long cbflood_max; int irq_capable; int can_boost; @@ -3678,8 +3678,8 @@ rcu_torture_cleanup(void) } if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { - char buf1[16+1]; - char buf2[16+1]; + char buf1[20+1]; + char buf2[20+1]; char sepchar = '-'; cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, buf1); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f9c4a24dc59c2..8cbec34011844 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -258,15 +258,15 @@ EXPORT_SYMBOL_GPL(kvfree_call_rcu); #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) -unsigned long rcutorture_gather_gp_seqs(void) +unsigned long long rcutorture_gather_gp_seqs(void) { - return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xff; + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; } EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); -void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) { - snprintf(cp, 8, "g%02lx", seqs & 0xff); + snprintf(cp, 8, "g%04llx", seqs & 0xffffULL); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e40c4b5c3267a..83cba3d2cc487 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -539,22 +539,22 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* Gather grace-period sequence numbers for rcutorture diagnostics. */ -unsigned long rcutorture_gather_gp_seqs(void) +unsigned long long rcutorture_gather_gp_seqs(void) { - return ((READ_ONCE(rcu_state.gp_seq) & 0xff) << 16) | - ((READ_ONCE(rcu_state.expedited_sequence) & 0xff) << 8) | - (READ_ONCE(rcu_state.gp_seq_polled) & 0xff); + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); } EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); /* Format grace-period sequence numbers for rcutorture diagnostics. */ -void rcutorture_format_gp_seqs(unsigned long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) { - unsigned int egp = (seqs >> 8) & 0xff; - unsigned int ggp = (seqs >> 16) & 0xff; - unsigned int pgp = seqs & 0xff; + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; - snprintf(cp, 16, "g%02x:e%02x:p%02x", ggp, egp, pgp); + snprintf(cp, 20, "g%04x:e%06x:p%04x", ggp, egp, pgp); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -- GitLab From a8f7c9c4576de902327cff5890d0fe882fb9bada Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Dec 2024 20:33:50 -0800 Subject: [PATCH 100/934] rcu: Trace expedited grace-period numbers in hexadecimal This commit reformats the expedited grace-period numbers into hexadecimal for easier decoding and comparison. The normal grace-period numbers remain in decimal for the time being. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/trace/events/rcu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e81431deaa50b..63fd8aa99af7c 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -207,7 +207,7 @@ TRACE_EVENT_RCU(rcu_exp_grace_period, __entry->gpevent = gpevent; ), - TP_printk("%s %ld %s", + TP_printk("%s %#lx %s", __entry->rcuname, __entry->gpseq, __entry->gpevent) ); -- GitLab From 65e6ff0f31184bd9ce01c7bfef28558e6b70f96a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Dec 2024 14:41:55 -0800 Subject: [PATCH 101/934] rcutorture: Add ftrace-compatible timestamp to GP# failure/close-call output This commit adds an ftrace-compatible microsecond-scale timestamp to the failure/close-call output, but only in kernels built with CONFIG_RCU_TORTURE_TEST_LOG_GP=y. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fb1b80498ae01..1fdadc1df9adb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -275,6 +275,7 @@ struct rt_read_seg { int rt_end_cpu; unsigned long long rt_gp_seq; unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -1989,6 +1990,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, // Sample grace-period sequence number, as good a place as any. if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); if (!first) rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; } @@ -3663,7 +3665,11 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); -- GitLab From 7acc2d90151fe6f5d8409df44e10cd24a0296e9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Dec 2024 10:23:06 -0800 Subject: [PATCH 102/934] rcutorture: Make cur_ops->format_gp_seqs take buffer length The Tree and Tiny implementations of rcutorture_format_gp_seqs() use hard-coded constants for the length of the buffer that they format into. This is of course an accident waiting to happen, so this commit therefore makes them take a length argument. The rcutorture calling code uses ARRAY_SIZE() to safely compute this new argument. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 2 +- kernel/rcu/rcutorture.c | 8 +++++--- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 705fcbe6f5001..82d8b494cc30d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -591,7 +591,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void rcu_gp_set_torture_wait(int duration) { } #endif unsigned long long rcutorture_gather_gp_seqs(void); -void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1fdadc1df9adb..9c9a349b9c7f8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -411,7 +411,7 @@ struct rcu_torture_ops { void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); unsigned long long (*gather_gp_seqs)(void); - void (*format_gp_seqs)(unsigned long long seqs, char *cp); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); long cbflood_max; int irq_capable; int can_boost; @@ -3688,8 +3688,10 @@ rcu_torture_cleanup(void) char buf2[20+1]; char sepchar = '-'; - cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, buf1); - cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, buf2); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { if (buf2[0]) { for (j = 0; buf2[j]; j++) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 8cbec34011844..8a52aca686a52 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -264,9 +264,9 @@ unsigned long long rcutorture_gather_gp_seqs(void) } EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); -void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) { - snprintf(cp, 8, "g%04llx", seqs & 0xffffULL); + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 83cba3d2cc487..bb061e5870c35 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -548,13 +548,13 @@ unsigned long long rcutorture_gather_gp_seqs(void) EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); /* Format grace-period sequence numbers for rcutorture diagnostics. */ -void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp) +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) { unsigned int egp = (seqs >> 16) & 0xffffffULL; unsigned int ggp = (seqs >> 40) & 0xffffULL; unsigned int pgp = seqs & 0xffffULL; - snprintf(cp, 20, "g%04x:e%06x:p%04x", ggp, egp, pgp); + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -- GitLab From 5d45bdf292e62dda86b4f0a5d456287d22a0d2b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Dec 2024 09:48:06 -0800 Subject: [PATCH 103/934] rcutorture: Move RCU_TORTURE_TEST_{CHK_RDR_STATE,LOG_CPU} to bool The RCU_TORTURE_TEST_CHK_RDR_STATE and RCU_TORTURE_TEST_LOG_CPU Kconfig options are pointlessly defined as tristate. This commit therefore converts them to bool. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412241458.150d082b-lkp@intel.com Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 25a9dc2be0dcd..12e4c64ebae15 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -54,7 +54,7 @@ config RCU_TORTURE_TEST Say N if you are unsure. config RCU_TORTURE_TEST_CHK_RDR_STATE - tristate "Check rcutorture reader state" + bool "Check rcutorture reader state" depends on RCU_TORTURE_TEST default n help @@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE Say N if you are unsure. config RCU_TORTURE_TEST_LOG_CPU - tristate "Log CPU for rcutorture failures" + bool "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST default n help -- GitLab From 38b43eca6665434225284c0b583a39340f4e1f37 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 10:14:53 -0800 Subject: [PATCH 104/934] rcutorture: Complain when invalid SRCU reader_flavor is specified Currently, rcutorture ignores reader_flavor bits that are not in the SRCU_READ_FLAVOR_ALL bitmask, which could confuse rcutorture users into believing buggy patches had been fully tested. This commit therefore produces a splat in this case. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9c9a349b9c7f8..be4e3c6b912fe 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -689,6 +689,8 @@ static int srcu_torture_read_lock(void) int idx; int ret = 0; + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); -- GitLab From 536e8b9b80bc7a0a8e87af2d5fb7fe3e230669ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Jan 2025 11:26:23 -0800 Subject: [PATCH 105/934] srcu: Add FORCE_NEED_SRCU_NMI_SAFE Kconfig for testing The srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe() functions map to __srcu_read_lock() and __srcu_read_unlock() on systems like x86 that have NMI-safe this_cpu_inc() operations. This makes the underlying __srcu_read_lock_nmisafe() and __srcu_read_unlock_nmisafe() functions difficult to test on (for example) x86 systems, allowing bugs to creep in. This commit therefore creates a FORCE_NEED_SRCU_NMI_SAFE Kconfig that forces those underlying functions to be used even on systems where they are not needed, thus providing better testing coverage. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185db..c8e540af3a353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -65,6 +65,17 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU -- GitLab From 6be43acb2a6d2b20038616eed9bc08ef0f962a77 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Jan 2025 14:34:28 -0800 Subject: [PATCH 106/934] torture: Make SRCU lockdep testing use srcu_read_lock_nmisafe() Recent experience shows that the srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe() functions are not sufficiently tested. This commit therefore causes the torture.sh script's SRCU lockdep testing to use these two functions. This will cause these two functions to be regularly tested by several developers (myself included) who use torture.sh as an RCU acceptance test. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh index 2e63ef009d593..2db12c5cad9c6 100755 --- a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh +++ b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh @@ -49,7 +49,7 @@ do do err= val=$((d*1000+t*10+c)) - tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --bootargs "rcutorture.test_srcu_lockdep=$val" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --kconfig "CONFIG_FORCE_NEED_SRCU_NMI_SAFE=y" --bootargs "rcutorture.test_srcu_lockdep=$val rcutorture.reader_flavor=0x2" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 ret=$? mv "$T/kvm.sh.out" "$RCUTORTURE/res/$ds/$val" if test "$d" -ne 0 && test "$ret" -eq 0 -- GitLab From 0f46d81f2bce970b1c562aa3c944a271bbec2729 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 17:58:00 +0100 Subject: [PATCH 107/934] fanotify: notify on mount attach and detach Add notifications for attaching and detaching mounts. The following new event masks are added: FAN_MNT_ATTACH - Mount was attached FAN_MNT_DETACH - Mount was detached If a mount is moved, then the event is reported with (FAN_MNT_ATTACH | FAN_MNT_DETACH). These events add an info record of type FAN_EVENT_INFO_TYPE_MNT containing these fields identifying the affected mounts: __u64 mnt_id - the ID of the mount (see statmount(2)) FAN_REPORT_MNT must be supplied to fanotify_init() to receive these events and no other type of event can be received with this report type. Marks are added with FAN_MARK_MNTNS, which records the mount namespace from an nsfs file (e.g. /proc/self/ns/mnt). Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129165803.72138-3-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/mount.h | 2 + fs/namespace.c | 14 ++++- fs/notify/fanotify/fanotify.c | 38 ++++++++++++- fs/notify/fanotify/fanotify.h | 18 ++++++ fs/notify/fanotify/fanotify_user.c | 89 +++++++++++++++++++++++++----- fs/notify/fdinfo.c | 5 ++ include/linux/fanotify.h | 12 ++-- include/uapi/linux/fanotify.h | 10 ++++ 8 files changed, 164 insertions(+), 24 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 82aa3bad7cf5a..5324a931b4034 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -181,3 +181,5 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } + +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index a3ed3f2980cba..bc77b2e0130dc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2145,16 +2145,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr } } +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) +{ + if (!is_mnt_ns_file(dentry)) + return NULL; + + return to_mnt_ns(get_proc_ns(dentry->d_inode)); +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ - struct mnt_namespace *mnt_ns; - if (!is_mnt_ns_file(dentry)) + struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); + + if (!mnt_ns) return false; - mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 95646f7c46cab..6d386080faf22 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FS_ERROR: return fanotify_error_event_equal(FANOTIFY_EE(old), FANOTIFY_EE(new)); + case FANOTIFY_EVENT_TYPE_MNT: + return false; default: WARN_ON_ONCE(1); } @@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - if (!fid_mode) { + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (data_type != FSNOTIFY_EVENT_MNT) + return 0; + } else if (!fid_mode) { /* Do we have path to open a file descriptor? */ if (!path) return 0; @@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } +static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp) +{ + struct fanotify_mnt_event *pevent; + + pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp); + if (!pevent) + return NULL; + + pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT; + pevent->mnt_id = mnt_id; + + return &pevent->fae; +} + static struct fanotify_event *fanotify_alloc_perm_event(const void *data, int data_type, gfp_t gfp) @@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event( fid_mode); struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); + u64 mnt_id = fsnotify_data_mnt_id(data, data_type); struct mem_cgroup *old_memcg; struct dentry *moved = NULL; struct inode *child = NULL; @@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event( moved, &hash, gfp); } else if (fid_mode) { event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); - } else { + } else if (path) { event = fanotify_alloc_path_event(path, &hash, gfp); + } else if (mnt_id) { + event = fanotify_alloc_mnt_event(mnt_id, gfp); + } else { + WARN_ON_ONCE(1); } if (!event) @@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_RENAME != FS_RENAME); BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); @@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group, mempool_free(fee, &group->fanotify_data.error_events_pool); } +static void fanotify_free_mnt_event(struct fanotify_event *event) +{ + kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event)); +} + static void fanotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { @@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group, case FANOTIFY_EVENT_TYPE_FS_ERROR: fanotify_free_error_event(group, event); break; + case FANOTIFY_EVENT_TYPE_MNT: + fanotify_free_mnt_event(event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index c12cbc270539c..b44e70e44be65 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +extern struct kmem_cache *fanotify_mnt_event_cachep; /* Possible states of the permission event */ enum { @@ -244,6 +245,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ + FANOTIFY_EVENT_TYPE_MNT, __FANOTIFY_EVENT_TYPE_NUM }; @@ -409,12 +411,23 @@ struct fanotify_path_event { struct path path; }; +struct fanotify_mnt_event { + struct fanotify_event fae; + u64 mnt_id; +}; + static inline struct fanotify_path_event * FANOTIFY_PE(struct fanotify_event *event) { return container_of(event, struct fanotify_path_event, fae); } +static inline struct fanotify_mnt_event * +FANOTIFY_ME(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_mnt_event, fae); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } +static inline bool fanotify_is_mnt_event(u32 mask) +{ + return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH); +} + static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ba3e2d09eb44e..f2d840ae4ded8 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init; struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; struct kmem_cache *fanotify_path_event_cachep __ro_after_init; struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; +struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ @@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; (sizeof(struct fanotify_event_info_error)) #define FANOTIFY_RANGE_INFO_LEN \ (sizeof(struct fanotify_event_info_range)) +#define FANOTIFY_MNT_INFO_LEN \ + (sizeof(struct fanotify_event_info_mnt)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode, fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (fanotify_is_mnt_event(event->mask)) + event_len += FANOTIFY_MNT_INFO_LEN; if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_LEN; @@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_mnt_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_mnt info = { }; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; + info.hdr.len = FANOTIFY_MNT_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.mnt_id = FANOTIFY_ME(event)->mnt_id; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { @@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_mnt_event(event->mask)) { + ret = copy_mnt_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL; + /* Don't allow mixing mnt events with inode events for now */ + if (flags & FAN_REPORT_MNT) { + if (class != FAN_CLASS_NOTIF) + return -EINVAL; + if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) + return -EINVAL; + } + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; - struct vfsmount *mnt = NULL; struct fsnotify_group *group; struct path path; struct fan_fsid __fsid, *fsid = NULL; @@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; - void *obj; + void *obj = NULL; u32 umask = 0; int ret; @@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; + case FAN_MARK_MNTNS: + obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; + break; default: return -EINVAL; } @@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; group = fd_file(f)->private_data; + /* Only report mount events on mnt namespace */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (mask & ~FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type != FAN_MARK_MNTNS) + return -EINVAL; + } else { + if (mask & FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type == FAN_MARK_MNTNS) + return -EINVAL; + } + /* * An unprivileged user is not allowed to setup mount nor filesystem * marks. This also includes setting up such marks by a group that @@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) return -EINVAL; @@ -1938,17 +1994,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) { + if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; obj = inode; - } else { - mnt = path.mnt; - if (mark_type == FAN_MARK_MOUNT) - obj = mnt; - else - obj = mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + obj = path.mnt; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { + obj = path.mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + obj = mnt_ns_from_dentry(path.dentry); } + ret = -EINVAL; + if (!obj) + goto path_put_and_out; + /* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive @@ -1956,10 +2016,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, */ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { - ret = mnt ? -EINVAL : -EISDIR; + ret = !inode ? -EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode))) + (!inode || S_ISDIR(inode->i_mode))) goto path_put_and_out; ret = 0; @@ -1968,7 +2028,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ - if (mnt || !S_ISDIR(inode->i_mode)) { + if (!inode || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; umask = FAN_EVENT_ON_CHILD; /* @@ -2042,7 +2102,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, @@ -2055,6 +2115,7 @@ static int __init fanotify_user_setup(void) fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index e933f9c65d904..1161eabf11ee1 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", sb->s_dev, mflags, mark->mask, mark->ignore_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector); + + seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n", + mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask); } } diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 78f660ebc3180..3c817dc6292ea 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -25,7 +25,7 @@ #define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET) -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD | FAN_REPORT_MNT) /* * fanotify_init() flags that require CAP_SYS_ADMIN. @@ -38,7 +38,8 @@ FAN_REPORT_PIDFD | \ FAN_REPORT_FD_ERROR | \ FAN_UNLIMITED_QUEUE | \ - FAN_UNLIMITED_MARKS) + FAN_UNLIMITED_MARKS | \ + FAN_REPORT_MNT) /* * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. @@ -58,7 +59,7 @@ #define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV) #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ - FAN_MARK_FILESYSTEM) + FAN_MARK_FILESYSTEM | FAN_MARK_MNTNS) #define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \ FAN_MARK_FLUSH) @@ -109,10 +110,13 @@ /* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ #define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) +#define FANOTIFY_MOUNT_EVENTS (FAN_MNT_ATTACH | FAN_MNT_DETACH) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ FANOTIFY_INODE_EVENTS | \ - FANOTIFY_ERROR_EVENTS) + FANOTIFY_ERROR_EVENTS | \ + FANOTIFY_MOUNT_EVENTS) /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index bd81679797077..e710967c7c263 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -28,6 +28,8 @@ /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ @@ -64,6 +66,7 @@ #define FAN_REPORT_NAME 0x00000800 /* Report events with name */ #define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */ #define FAN_REPORT_FD_ERROR 0x00002000 /* event->fd can report error */ +#define FAN_REPORT_MNT 0x00004000 /* Report mount events */ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) @@ -94,6 +97,7 @@ #define FAN_MARK_INODE 0x00000000 #define FAN_MARK_MOUNT 0x00000010 #define FAN_MARK_FILESYSTEM 0x00000100 +#define FAN_MARK_MNTNS 0x00000110 /* * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY @@ -147,6 +151,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 #define FAN_EVENT_INFO_TYPE_RANGE 6 +#define FAN_EVENT_INFO_TYPE_MNT 7 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -200,6 +205,11 @@ struct fanotify_event_info_range { __u64 count; }; +struct fanotify_event_info_mnt { + struct fanotify_event_info_header hdr; + __u64 mnt_id; +}; + /* * User space may need to record additional information about its decision. * The extra information type records what kind of information is included. -- GitLab From bf630c40164162ba1d3933c2f5e3397d083e0948 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 17:58:01 +0100 Subject: [PATCH 108/934] vfs: add notifications for mount attach and detach Add notifications for attaching and detaching mounts to fs/namespace.c Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129165803.72138-4-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/mount.h | 20 +++++++++++++ fs/namespace.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/pnode.c | 4 ++- 3 files changed, 101 insertions(+), 2 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 5324a931b4034..946dc8b792d7d 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,8 @@ #include #include +extern struct list_head notify_list; + struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -80,6 +82,8 @@ struct mount { #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; + struct list_head to_notify; /* need to queue notification */ + struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ @@ -182,4 +186,20 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) return container_of(ns, struct mnt_namespace, ns); } +#ifdef CONFIG_FSNOTIFY +static inline void mnt_notify_add(struct mount *m) +{ + /* Optimize the case where there are no watches */ + if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || + (m->prev_ns && m->prev_ns->n_fsnotify_marks)) + list_add_tail(&m->to_notify, ¬ify_list); + else + m->prev_ns = m->mnt_ns; +} +#else +static inline void mnt_notify_add(struct mount *m) +{ +} +#endif + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index bc77b2e0130dc..05f6680e2251f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -81,6 +81,9 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); +#ifdef CONFIG_FSNOTIFY +LIST_HEAD(notify_list); /* protected by namespace_sem */ +#endif static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ @@ -163,6 +166,7 @@ static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { + fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } @@ -1176,6 +1180,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + + mnt_notify_add(mnt); } /* @@ -1723,6 +1729,50 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); +#ifdef CONFIG_FSNOTIFY +static void mnt_notify(struct mount *p) +{ + if (!p->prev_ns && p->mnt_ns) { + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } else if (p->prev_ns && !p->mnt_ns) { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + } else if (p->prev_ns == p->mnt_ns) { + fsnotify_mnt_move(p->mnt_ns, &p->mnt); + } else { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } + p->prev_ns = p->mnt_ns; +} + +static void notify_mnt_list(void) +{ + struct mount *m, *tmp; + /* + * Notify about mounts that were added/reparented/detached/remain + * connected after unmount. + */ + list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) { + mnt_notify(m); + list_del_init(&m->to_notify); + } +} + +static bool need_notify_mnt_list(void) +{ + return !list_empty(¬ify_list); +} +#else +static void notify_mnt_list(void) +{ +} + +static bool need_notify_mnt_list(void) +{ + return false; +} +#endif + static void namespace_unlock(void) { struct hlist_head head; @@ -1733,7 +1783,18 @@ static void namespace_unlock(void) hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); - up_write(&namespace_sem); + if (need_notify_mnt_list()) { + /* + * No point blocking out concurrent readers while notifications + * are sent. This will also allow statmount()/listmount() to run + * concurrently. + */ + downgrade_write(&namespace_sem); + notify_mnt_list(); + up_read(&namespace_sem); + } else { + up_write(&namespace_sem); + } shrink_dentry_list(&list); @@ -1846,6 +1907,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); + + /* + * At this point p->mnt_ns is NULL, notification will be queued + * only if + * + * - p->prev_ns is non-NULL *and* + * - p->prev_ns->n_fsnotify_marks is non-NULL + * + * This will preclude queuing the mount if this is a cleanup + * after a failed copy_tree() or destruction of an anonymous + * namespace, etc. + */ + mnt_notify_add(p); } } @@ -2555,6 +2629,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, dest_mp = smp; unhash_mnt(source_mnt); attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + mnt_notify_add(source_mnt); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { @@ -4476,6 +4551,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); + mnt_notify_add(root_mnt); + mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: diff --git a/fs/pnode.c b/fs/pnode.c index ef048f008bdd5..82d809c785ec7 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -549,8 +549,10 @@ static void restore_mounts(struct list_head *to_restore) mp = parent->mnt_mp; parent = parent->mnt_parent; } - if (parent != mnt->mnt_parent) + if (parent != mnt->mnt_parent) { mnt_change_mountpoint(parent, mp, mnt); + mnt_notify_add(mnt); + } } } -- GitLab From 262b2fa99cbe02a715ce23981c2c30685ccf3a93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 5 Feb 2025 19:17:47 +0100 Subject: [PATCH 109/934] pipe: introduce struct file_operations pipeanon_fops So that fifos and anonymous pipes could have different f_op methods. Preparation to simplify the next patch. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250205181747.GB13817@redhat.com Tested-by: K Prateek Nayak Signed-off-by: Christian Brauner --- fs/pipe.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 94b59045ab44b..2eacfde61e745 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -878,6 +878,8 @@ static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; +static const struct file_operations pipeanon_fops; + static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); @@ -895,7 +897,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &pipefifo_fops; + inode->i_fop = &pipeanon_fops; /* * Mark the inode dirty from the very beginning, @@ -938,7 +940,7 @@ int create_pipe_files(struct file **res, int flags) f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); @@ -949,7 +951,7 @@ int create_pipe_files(struct file **res, int flags) f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); @@ -1107,8 +1109,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { + bool is_pipe = inode->i_fop == &pipeanon_fops; struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_pipe = 0; @@ -1241,6 +1243,17 @@ const struct file_operations pipefifo_fops = { .splice_write = iter_file_splice_write, }; +static const struct file_operations pipeanon_fops = { + .open = fifo_open, + .read_iter = pipe_read, + .write_iter = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, +}; + /* * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. @@ -1388,7 +1401,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; - if (file->f_op != &pipefifo_fops || !pipe) + if (!pipe) + return NULL; + if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; -- GitLab From f017b0a4951fac8f150232661b2cc0b67e0c57f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 5 Feb 2025 19:18:12 +0100 Subject: [PATCH 110/934] pipe: don't update {a,c,m}time for anonymous pipes These numbers are visible in fstat() but hopefully nobody uses this information and file_accessed/file_update_time are not that cheap. Stupid test-case: #include #include #include #include #include #include static char buf[17 * 4096]; static struct timeval TW, TR; int wr(int fd, int size) { int c, r; struct timeval t0, t1; gettimeofday(&t0, NULL); for (c = 0; (r = write(fd, buf, size)) > 0; c += r); gettimeofday(&t1, NULL); timeradd(&TW, &t1, &TW); timersub(&TW, &t0, &TW); return c; } int rd(int fd, int size) { int c, r; struct timeval t0, t1; gettimeofday(&t0, NULL); for (c = 0; (r = read(fd, buf, size)) > 0; c += r); gettimeofday(&t1, NULL); timeradd(&TR, &t1, &TR); timersub(&TR, &t0, &TR); return c; } int main(int argc, const char *argv[]) { int fd[2], nb = 1, loop, size; assert(argc == 3); loop = atoi(argv[1]); size = atoi(argv[2]); assert(pipe(fd) == 0); assert(ioctl(fd[0], FIONBIO, &nb) == 0); assert(ioctl(fd[1], FIONBIO, &nb) == 0); assert(size <= sizeof(buf)); while (loop--) assert(wr(fd[1], size) == rd(fd[0], size)); struct timeval tt; timeradd(&TW, &TR, &tt); printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n", TW.tv_sec, TW.tv_usec/1000, TR.tv_sec, TR.tv_usec/1000, tt.tv_sec, tt.tv_usec/1000); return 0; } Before: # for i in 1 2 3; do /host/tmp/test 10000 100; done TW = 8.047 TR = 5.845 TT = 13.893 TW = 8.091 TR = 5.872 TT = 13.963 TW = 8.083 TR = 5.885 TT = 13.969 After: # for i in 1 2 3; do /host/tmp/test 10000 100; done TW = 4.752 TR = 4.664 TT = 9.416 TW = 4.684 TR = 4.608 TT = 9.293 TW = 4.736 TR = 4.652 TT = 9.388 Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250205181812.GC13817@redhat.com Tested-by: K Prateek Nayak Signed-off-by: Christian Brauner --- fs/pipe.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 2eacfde61e745..2ae75adfba64b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -248,7 +248,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, } static ssize_t -pipe_read(struct kiocb *iocb, struct iov_iter *to) +anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; @@ -404,8 +404,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + return ret; +} + +static ssize_t +fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + int ret = anon_pipe_read(iocb, to); if (ret > 0) - file_accessed(filp); + file_accessed(iocb->ki_filp); return ret; } @@ -426,7 +433,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) } static ssize_t -pipe_write(struct kiocb *iocb, struct iov_iter *from) +anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; @@ -604,11 +611,21 @@ out: kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + return ret; +} + +static ssize_t +fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + int ret = anon_pipe_write(iocb, from); + if (ret > 0) { + struct file *filp = iocb->ki_filp; + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -1234,8 +1251,8 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = fifo_pipe_read, + .write_iter = fifo_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, @@ -1245,8 +1262,8 @@ const struct file_operations pipefifo_fops = { static const struct file_operations pipeanon_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = anon_pipe_read, + .write_iter = anon_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, -- GitLab From f584714cffb9f7b3f1ae1d27bd57f099642fcfe8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 5 Feb 2025 15:34:29 -0600 Subject: [PATCH 111/934] pstore: convert to the new mount API Convert the pstore filesystem to the new mount API. Cc: Kees Cook Cc: Tony Luck Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/20250205213931.74614-2-sandeen@redhat.com Reviewed-by: Kees Cook Signed-off-by: Christian Brauner --- fs/pstore/inode.c | 109 +++++++++++++++++++++++++++++++--------------- 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56815799ce798..0341438694219 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -14,10 +14,10 @@ #include #include #include -#include #include #include -#include +#include +#include #include #include #include @@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb) } enum { - Opt_kmsg_bytes, Opt_err + Opt_kmsg_bytes }; -static const match_table_t tokens = { - {Opt_kmsg_bytes, "kmsg_bytes=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec pstore_param_spec[] = { + fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes), + {} }; -static void parse_options(char *options) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return; +struct pstore_context { + unsigned int kmsg_bytes; +}; - while ((p = strsep(&options, ",")) != NULL) { - int token; +static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct pstore_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - if (!*p) - continue; + opt = fs_parse(fc, pstore_param_spec, param, &result); + /* pstore has historically ignored invalid kmsg_bytes param */ + if (opt < 0) + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_kmsg_bytes: - if (!match_int(&args[0], &option)) - pstore_set_kmsg_bytes(option); - break; - } + switch (opt) { + case Opt_kmsg_bytes: + ctx->kmsg_bytes = result.uint_32; + break; + default: + return -EINVAL; } + + return 0; } /* @@ -269,10 +270,12 @@ static int pstore_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int pstore_remount(struct super_block *sb, int *flags, char *data) +static int pstore_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - parse_options(data); + struct pstore_context *ctx = fc->fs_private; + + sync_filesystem(fc->root->d_sb); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); return 0; } @@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = pstore_evict_inode, - .remount_fs = pstore_remount, .show_options = pstore_show_options, }; @@ -406,8 +408,9 @@ void pstore_get_records(int quiet) inode_unlock(d_inode(root)); } -static int pstore_fill_super(struct super_block *sb, void *data, int silent) +static int pstore_fill_super(struct super_block *sb, struct fs_context *fc) { + struct pstore_context *ctx = fc->fs_private; struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -417,7 +420,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &pstore_ops; sb->s_time_gran = 1; - parse_options(data); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); inode = pstore_get_inode(sb); if (inode) { @@ -438,12 +441,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *pstore_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int pstore_get_tree(struct fs_context *fc) +{ + if (fc->root) + return pstore_reconfigure(fc); + + return get_tree_single(fc, pstore_fill_super); +} + +static void pstore_free_fc(struct fs_context *fc) { - return mount_single(fs_type, flags, data, pstore_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations pstore_context_ops = { + .parse_param = pstore_parse_param, + .get_tree = pstore_get_tree, + .reconfigure = pstore_reconfigure, + .free = pstore_free_fc, +}; + static void pstore_kill_sb(struct super_block *sb) { guard(mutex)(&pstore_sb_lock); @@ -456,11 +473,33 @@ static void pstore_kill_sb(struct super_block *sb) INIT_LIST_HEAD(&records_list); } +static int pstore_init_fs_context(struct fs_context *fc) +{ + struct pstore_context *ctx; + + ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* + * Global kmsg_bytes is initialized to default, and updated + * every time we (re)mount the single-sb filesystem with the + * option specified. + */ + ctx->kmsg_bytes = kmsg_bytes; + + fc->fs_private = ctx; + fc->ops = &pstore_context_ops; + + return 0; +} + static struct file_system_type pstore_fs_type = { .owner = THIS_MODULE, .name = "pstore", - .mount = pstore_mount, .kill_sb = pstore_kill_sb, + .init_fs_context = pstore_init_fs_context, + .parameters = pstore_param_spec, }; int __init pstore_init_fs(void) -- GitLab From cc0876f817d6d1636795e97c20c3b2b1e177718c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Feb 2025 15:34:30 -0600 Subject: [PATCH 112/934] vfs: Convert devpts to use the new mount API Convert the devpts filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Cc: "Eric W. Biederman" Signed-off-by: David Howells [sandeen: forward port, keep pr_err vs errorf] Co-developed-by: Eric Sandeen Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/20250205213931.74614-3-sandeen@redhat.com Signed-off-by: Christian Brauner --- fs/devpts/inode.c | 251 ++++++++++++++++++++-------------------------- 1 file changed, 110 insertions(+), 141 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 1096ff8562fa1..42e4d6eeb29f5 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -21,7 +23,6 @@ #include #include #include -#include #include #include @@ -87,14 +88,14 @@ enum { Opt_err }; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_ptmxmode, "ptmxmode=%o"}, - {Opt_newinstance, "newinstance"}, - {Opt_max, "max=%d"}, - {Opt_err, NULL} +static const struct fs_parameter_spec devpts_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_s32 ("max", Opt_max), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("newinstance", Opt_newinstance), + fsparam_u32oct ("ptmxmode", Opt_ptmxmode), + fsparam_u32 ("uid", Opt_uid), + {} }; struct pts_fs_info { @@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi) deactivate_super(fsi->sb); } -#define PARSE_MOUNT 0 -#define PARSE_REMOUNT 1 - /* - * parse_mount_options(): - * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. - * - * Note: @data may be NULL (in which case all options are set to default). + * devpts_parse_param - Parse mount parameters */ -static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - kuid_t uid; - kgid_t gid; - - opts->setuid = 0; - opts->setgid = 0; - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->mode = DEVPTS_DEFAULT_MODE; - opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - opts->max = NR_UNIX98_PTY_MAX; - - /* Only allow instances mounted from the initial mount - * namespace to tap the reserve pool of ptys. - */ - if (op == PARSE_MOUNT) - opts->reserve = - (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); - - while ((p = strsep(&data, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - int option; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - opts->uid = uid; - opts->setuid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - opts->gid = gid; - opts->setgid = 1; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; - case Opt_ptmxmode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->ptmxmode = option & S_IALLUGO; - break; - case Opt_newinstance: - break; - case Opt_max: - if (match_int(&args[0], &option) || - option < 0 || option > NR_UNIX98_PTY_MAX) - return -EINVAL; - opts->max = option; - break; - default: - pr_err("called with bogus options\n"); - return -EINVAL; - } + struct pts_fs_info *fsi = fc->s_fs_info; + struct pts_mount_opts *opts = &fsi->mount_opts; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, devpts_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->uid = result.uid; + opts->setuid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->setgid = 1; + break; + case Opt_mode: + opts->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_ptmxmode: + opts->ptmxmode = result.uint_32 & S_IALLUGO; + break; + case Opt_newinstance: + break; + case Opt_max: + if (result.uint_32 > NR_UNIX98_PTY_MAX) + return invalf(fc, "max out of range"); + opts->max = result.uint_32; + break; } return 0; } -static int mknod_ptmx(struct super_block *sb) +static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) { int mode; int rc = -ENOMEM; @@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) } } -static int devpts_remount(struct super_block *sb, int *flags, char *data) +static int devpts_reconfigure(struct fs_context *fc) { - int err; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb); + struct pts_fs_info *new = fc->s_fs_info; - err = parse_mount_options(data, PARSE_REMOUNT, opts); + /* Apply the revised options. We don't want to change ->reserve. + * Ideally, we'd update each option conditionally on it having been + * explicitly changed, but the default is to reset everything so that + * would break UAPI... + */ + fsi->mount_opts.setuid = new->mount_opts.setuid; + fsi->mount_opts.setgid = new->mount_opts.setgid; + fsi->mount_opts.uid = new->mount_opts.uid; + fsi->mount_opts.gid = new->mount_opts.gid; + fsi->mount_opts.mode = new->mount_opts.mode; + fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode; + fsi->mount_opts.max = new->mount_opts.max; /* * parse_mount_options() restores options to default values @@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) */ update_ptmx_mode(fsi); - return err; + return 0; } static int devpts_show_options(struct seq_file *seq, struct dentry *root) @@ -402,31 +368,13 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations devpts_sops = { .statfs = simple_statfs, - .remount_fs = devpts_remount, .show_options = devpts_show_options, }; -static void *new_pts_fs_info(struct super_block *sb) -{ - struct pts_fs_info *fsi; - - fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); - if (!fsi) - return NULL; - - ida_init(&fsi->allocated_ptys); - fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; - fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - fsi->sb = sb; - - return fsi; -} - -static int -devpts_fill_super(struct super_block *s, void *data, int silent) +static int devpts_fill_super(struct super_block *s, struct fs_context *fc) { + struct pts_fs_info *fsi = DEVPTS_SB(s); struct inode *inode; - int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -435,20 +383,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; + fsi->sb = s; - error = -ENOMEM; - s->s_fs_info = new_pts_fs_info(s); - if (!s->s_fs_info) - goto fail; - - error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); - if (error) - goto fail; - - error = -ENOMEM; inode = new_inode(s); if (!inode) - goto fail; + return -ENOMEM; inode->i_ino = 1; simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_root = d_make_root(inode); if (!s->s_root) { pr_err("get root dentry failed\n"); - goto fail; + return -ENOMEM; } - error = mknod_ptmx(s); - if (error) - goto fail_dput; - - return 0; -fail_dput: - dput(s->s_root); - s->s_root = NULL; -fail: - return error; + return mknod_ptmx(s, fc); } /* - * devpts_mount() + * devpts_get_tree() * * Mount a new (private) instance of devpts. PTYs created in this * instance are independent of the PTYs in other devpts instances. */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int devpts_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, devpts_fill_super); +} + +static void devpts_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations devpts_context_ops = { + .free = devpts_free_fc, + .parse_param = devpts_parse_param, + .get_tree = devpts_get_tree, + .reconfigure = devpts_reconfigure, +}; + +/* + * Set up the filesystem mount context. + */ +static int devpts_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, devpts_fill_super); + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.uid = GLOBAL_ROOT_UID; + fsi->mount_opts.gid = GLOBAL_ROOT_GID; + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->mount_opts.max = NR_UNIX98_PTY_MAX; + + if (fc->purpose == FS_CONTEXT_FOR_MOUNT && + current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns) + fsi->mount_opts.reserve = true; + + fc->s_fs_info = fsi; + fc->ops = &devpts_context_ops; + return 0; } static void devpts_kill_sb(struct super_block *sb) @@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", - .mount = devpts_mount, + .init_fs_context = devpts_init_fs_context, + .parameters = devpts_param_specs, .kill_sb = devpts_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; -- GitLab From cb0e0a8bf4e1e1c8cd6d11ccaa355a23e1853566 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 5 Feb 2025 15:34:31 -0600 Subject: [PATCH 113/934] devtmpfs: replace ->mount with ->get_tree in public instance To finalize mount API conversion, remove the ->mount op from the public instance in favor of ->get_tree etc. Copy most ops from the underlying ops vector (whether it's shmem or ramfs) and substitute our own ->get_tree which simply takes an extra reference on the existing internal mount as before. Thanks to Al for the fs_context_for_reconfigure() idea. Cc: Al Viro Cc: Neil Brown Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/20250205213931.74614-4-sandeen@redhat.com Signed-off-by: Christian Brauner --- drivers/base/devtmpfs.c | 81 ++++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index b848764ef0187..03a7c7902fcd2 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -63,22 +63,6 @@ __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; -static struct dentry *public_dev_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - struct super_block *s = mnt->mnt_sb; - int err; - - atomic_inc(&s->s_active); - down_write(&s->s_umount); - err = reconfigure_single(s, flags, data); - if (err < 0) { - deactivate_locked_super(s); - return ERR_PTR(err); - } - return dget(s->s_root); -} - static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS @@ -89,9 +73,40 @@ static struct file_system_type internal_fs_type = { .kill_sb = kill_litter_super, }; +/* Simply take a ref on the existing mount */ +static int devtmpfs_get_tree(struct fs_context *fc) +{ + struct super_block *sb = mnt->mnt_sb; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + fc->root = dget(sb->s_root); + return 0; +} + +/* Ops are filled in during init depending on underlying shmem or ramfs type */ +struct fs_context_operations devtmpfs_context_ops = {}; + +/* Call the underlying initialization and set to our ops */ +static int devtmpfs_init_fs_context(struct fs_context *fc) +{ + int ret; +#ifdef CONFIG_TMPFS + ret = shmem_init_fs_context(fc); +#else + ret = ramfs_init_fs_context(fc); +#endif + if (ret < 0) + return ret; + + fc->ops = &devtmpfs_context_ops; + + return 0; +} + static struct file_system_type dev_fs_type = { .name = "devtmpfs", - .mount = public_dev_mount, + .init_fs_context = devtmpfs_init_fs_context, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) @@ -442,6 +457,31 @@ static int __ref devtmpfsd(void *p) return 0; } +/* + * Get the underlying (shmem/ramfs) context ops to build ours + */ +static int devtmpfs_configure_context(void) +{ + struct fs_context *fc; + + fc = fs_context_for_reconfigure(mnt->mnt_root, mnt->mnt_sb->s_flags, + MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + /* Set up devtmpfs_context_ops based on underlying type */ + devtmpfs_context_ops.free = fc->ops->free; + devtmpfs_context_ops.dup = fc->ops->dup; + devtmpfs_context_ops.parse_param = fc->ops->parse_param; + devtmpfs_context_ops.parse_monolithic = fc->ops->parse_monolithic; + devtmpfs_context_ops.get_tree = &devtmpfs_get_tree; + devtmpfs_context_ops.reconfigure = fc->ops->reconfigure; + + put_fs_context(fc); + + return 0; +} + /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. @@ -456,6 +496,13 @@ int __init devtmpfs_init(void) pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } + + err = devtmpfs_configure_context(); + if (err) { + pr_err("unable to configure devtmpfs type %d\n", err); + return err; + } + err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); -- GitLab From bdfa77e7c6bfda57d28fba24a5195fca2392c08a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 5 Feb 2025 15:34:32 -0600 Subject: [PATCH 114/934] vfs: remove some unused old mount api code Remove reconfigure_single, mount_single, and compare_single now that no users remain. Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/20250205213931.74614-5-sandeen@redhat.com Signed-off-by: Christian Brauner --- fs/super.c | 55 -------------------------------------- include/linux/fs.h | 3 --- include/linux/fs_context.h | 2 -- 3 files changed, 60 deletions(-) diff --git a/fs/super.c b/fs/super.c index 5a7db4a556e37..723176dee2299 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1737,61 +1737,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -int reconfigure_single(struct super_block *s, - int flags, void *data) -{ - struct fs_context *fc; - int ret; - - /* The caller really need to be passing fc down into mount_single(), - * then a chunk of this can be removed. [Bollocks -- AV] - * Better yet, reconfiguration shouldn't happen, but rather the second - * mount should be rejected if the parameters are not compatible. - */ - fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - ret = parse_monolithic_mount_data(fc, data); - if (ret < 0) - goto out; - - ret = reconfigure_super(fc); -out: - put_fs_context(fc); - return ret; -} - -static int compare_single(struct super_block *s, void *p) -{ - return 1; -} - -struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - - s = sget(fs_type, compare_single, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - if (!s->s_root) { - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (!error) - s->s_flags |= SB_ACTIVE; - } else { - error = reconfigure_single(s, flags, data); - } - if (unlikely(error)) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_single); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. diff --git a/include/linux/fs.h b/include/linux/fs.h index be3ad155ec9f7..ff5e8ab9f951f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2641,9 +2641,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 4b4bfef6f053a..a19e4bd32e4d3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -144,8 +144,6 @@ extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); -int reconfigure_single(struct super_block *s, - int flags, void *data); extern int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, -- GitLab From c50105933f0c75aacc4f95c9bf36f7fbd9a83884 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:39:59 +0100 Subject: [PATCH 115/934] iomap: allow the file system to submit the writeback bios Change ->prepare_ioend to ->submit_ioend and require file systems that implement it to submit the bio. This is needed for file systems that do their own work on the bios before submitting them to the block layer like btrfs or zoned xfs. To make this easier also pass the writeback context to the method. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 11 +++++------ fs/iomap/buffered-io.c | 10 +++++----- fs/xfs/xfs_aops.c | 13 +++++++++---- include/linux/iomap.h | 12 +++++++----- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b03..04fc7a49067dd 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -283,7 +283,7 @@ The ``ops`` structure must be specified and is as follows: struct iomap_writeback_ops { int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned len); - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); void (*discard_folio)(struct folio *folio, loff_t pos); }; @@ -306,13 +306,12 @@ The fields are as follows: purpose. This function must be supplied by the filesystem. - - ``prepare_ioend``: Enables filesystems to transform the writeback - ioend or perform any other preparatory work before the writeback I/O - is submitted. + - ``submit_ioend``: Allows the file systems to hook into writeback bio + submission. This might include pre-write space accounting updates, or installing a custom ``->bi_end_io`` function for internal purposes, such as deferring the ioend completion to a workqueue to run metadata update - transactions from process context. + transactions from process context before submitting the bio. This function is optional. - ``discard_folio``: iomap calls this function after ``->map_blocks`` @@ -341,7 +340,7 @@ This can happen in interrupt or process context, depending on the storage device. Filesystems that need to update internal bookkeeping (e.g. unwritten -extent conversions) should provide a ``->prepare_ioend`` function to +extent conversions) should provide a ``->submit_ioend`` function to set ``struct iomap_end::bio::bi_end_io`` to its own function. This function should call ``iomap_finish_ioends`` after finishing its own work (e.g. unwritten extent conversion). diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900cd..7952bf004bdbb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1675,7 +1675,7 @@ static void iomap_writepage_end_bio(struct bio *bio) } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1694,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) + error = wpc->ops->submit_ioend(wpc, error); + else if (!error) + submit_bio(&wpc->ioend->io_bio); if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 67877c36ed11a..aa88895673d8f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -395,10 +395,11 @@ allocate_blocks: } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -420,7 +421,11 @@ xfs_prepare_ioend( if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -462,7 +467,7 @@ xfs_discard_folio( static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, .discard_folio = xfs_discard_folio, }; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b8..dc8df4f779d4c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -362,12 +362,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. */ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where -- GitLab From 710273330663241d9ca5fbed51909e65807556ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:00 +0100 Subject: [PATCH 116/934] iomap: simplify io_flags and io_type in struct iomap_ioend The ioend fields for distinct types of I/O are a bit complicated. Consolidate them into a single io_flag field with it's own flags decoupled from the iomap flags. This also prepares for adding a new flag that is unrelated to both of the iomap namespaces. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-3-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++----------------- fs/xfs/xfs_aops.c | 12 ++++++------ include/linux/iomap.h | 20 ++++++++++++++++++-- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7952bf004bdbb..d8d271107e60a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1605,13 +1605,10 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; - if (next->io_flags & IOMAP_F_BOUNDARY) + if (next->io_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; @@ -1709,7 +1706,8 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { struct iomap_ioend *ioend; struct bio *bio; @@ -1724,8 +1722,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; + ioend->io_flags = ioend_flags; if (pos > wpc->iomap.offset) wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; ioend->io_inode = inode; @@ -1737,14 +1734,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, return ioend; } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) - return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; @@ -1779,14 +1775,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index aa88895673d8f..8e60ceeb15204 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,7 +114,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); @@ -125,9 +125,9 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); if (!error && xfs_ioend_is_append(ioend)) @@ -410,7 +410,7 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,8 +418,8 @@ xfs_submit_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; if (status) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index dc8df4f779d4c..9583f64561658 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset, sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); +/* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + /* * Structure for writeback I/O completions. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ -- GitLab From 034c29fb3e7c119c42e650986e280f025a1bec7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:01 +0100 Subject: [PATCH 117/934] iomap: add a IOMAP_F_ANON_WRITE flag Add a IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not have a target block assigned to it yet at iomap time and the file system will do that in the bio submission handler, splitting the I/O as needed. This is used to implement Zone Append based I/O for zoned XFS, where splitting writes to the hardware limits and assigning a zone to them happens just before sending the I/O off to the block layer, but could also be useful for other things like compressed I/O. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-4-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/design.rst | 4 ++++ fs/iomap/buffered-io.c | 13 +++++++++---- fs/iomap/direct-io.c | 6 ++++-- include/linux/iomap.h | 7 +++++++ 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e5..28ab3758c474e 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -246,6 +246,10 @@ The fields are as follows: * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can be set by the filesystem for its own purposes. + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target + block assigned to it yet and the file system will do that in the bio + submission handler, splitting the I/O as needed. + These flags can be set by iomap itself during file operations. The filesystem should supply an ``->iomap_end`` function if it needs to observe these flags: diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d8d271107e60a..ba795d72e546b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1691,10 +1691,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->submit_ioend) + if (wpc->ops->submit_ioend) { error = wpc->ops->submit_ioend(wpc, error); - else if (!error) - submit_bio(&wpc->ioend->io_bio); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); @@ -1744,7 +1748,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e8..641649a04614e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,10 +81,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9583f64561658..eb0764945b42c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } -- GitLab From 5fcbd555d48390a8c819ba7fdf55fbfcabe05c80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:02 +0100 Subject: [PATCH 118/934] iomap: split bios to zone append limits in the submission handlers Provide helpers for file systems to split bios in the direct I/O and writeback I/O submission handlers. The split ioends are chained to the parent ioend so that only the parent ioend originally generated by the iomap layer will be processed after all the chained off children have completed. This is based on the block layer bio chaining that has supported a similar mechanism for a long time. This Follows btrfs' lead and don't try to build bios to hardware limits for zone append commands, but instead build them as normal unconstrained bios and split them to the hardware limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-5-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/Makefile | 1 + fs/iomap/buffered-io.c | 49 ++++++++++++++---------- fs/iomap/ioend.c | 86 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 15 +++++++- 4 files changed, 130 insertions(+), 21 deletions(-) create mode 100644 fs/iomap/ioend.c diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c2325..69e8ebb41302a 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ba795d72e546b..f67e13a9807ae 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -40,7 +40,8 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) @@ -1539,15 +1540,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * ioend after this. */ static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,6 +1567,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } +static u32 +iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + return iomap_finish_ioend_buffered(ioend); +} + /* * Ioend completion routine for merged bios. This can only be called from task * contexts as merged ioends can be of unbound length. Hence we have to break up @@ -1667,8 +1686,10 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* @@ -1713,7 +1734,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, loff_t pos, u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1721,21 +1741,10 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_flags = ioend_flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 0000000000000..3ff38c665c313 --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. + */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eb0764945b42c..90c27875e39dc 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -353,12 +353,19 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, /* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend can split a bio generated + * by iomap. In that case the parent ioend it was split from is recorded + * in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completetion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ @@ -408,6 +415,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -479,4 +490,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ -- GitLab From 63b66913d11c5f3572dfdee38e78d510d0f90aa8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:03 +0100 Subject: [PATCH 119/934] iomap: move common ioend code to ioend.c This code will be reused for direct I/O soon, so split it out of buffered-io.c. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-6-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 135 +---------------------------------------- fs/iomap/internal.h | 9 +++ fs/iomap/ioend.c | 127 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 133 deletions(-) create mode 100644 fs/iomap/internal.h diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f67e13a9807ae..4abff64998feb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include #include #include -#include #include #include #include #include +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. @@ -40,9 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -struct bio_set iomap_ioend_bioset; -EXPORT_SYMBOL_GPL(iomap_ioend_bioset); - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -1539,8 +1534,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static u32 -iomap_finish_ioend_buffered(struct iomap_ioend *ioend) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; @@ -1567,123 +1561,6 @@ iomap_finish_ioend_buffered(struct iomap_ioend *ioend) return folio_count; } -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) -{ - if (ioend->io_parent) { - struct bio *bio = &ioend->io_bio; - - ioend = ioend->io_parent; - bio_put(bio); - } - - if (error) - cmpxchg(&ioend->io_error, 0, error); - - if (!atomic_dec_and_test(&ioend->io_remaining)) - return 0; - return iomap_finish_ioend_buffered(ioend); -} - -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. - */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_IOEND_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != - (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static void iomap_writepage_end_bio(struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); @@ -2081,11 +1958,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new file mode 100644 index 0000000000000..36d5c56e073ed --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 3ff38c665c313..97d43c50cdf70 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -3,6 +3,11 @@ * Copyright (c) 2024-2025 Christoph Hellwig. */ #include +#include +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, loff_t file_offset, u16 ioend_flags) @@ -22,6 +27,120 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode, } EXPORT_SYMBOL_GPL(iomap_init_ioend); +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + /* * Split up to the first @max_len bytes from @ioend if the ioend covers more * than @max_len bytes. @@ -84,3 +203,11 @@ struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, return split_ioend; } EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); -- GitLab From ae2f33a519af3730cacd1c787ebe1f7475df5ba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:04 +0100 Subject: [PATCH 120/934] iomap: factor out a iomap_dio_done helper Split out the struct iomap-dio level final completion from iomap_dio_bio_end_io into a helper to clean up the code and make it reusable. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-7-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 76 ++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 641649a04614e..277ece2437707 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -165,43 +165,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. - */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. + */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -219,19 +207,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. - */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { -- GitLab From e523f2d4c974a819730830ce1c38834ee0cd7318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:05 +0100 Subject: [PATCH 121/934] iomap: optionally use ioends for direct I/O struct iomap_ioend currently tracks outstanding buffered writes and has some really nice code in core iomap and XFS to merge contiguous I/Os an defer them to userspace for completion in a very efficient way. For zoned writes we'll also need a per-bio user context completion to record the written blocks, and the infrastructure for that would look basically like the ioend handling for buffered I/O. So instead of reinventing the wheel, reuse the existing infrastructure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-8-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 48 +++++++++++++++++++++++++++++++++++++++++-- fs/iomap/internal.h | 1 + fs/iomap/ioend.c | 2 ++ include/linux/iomap.h | 10 +++++---- 4 files changed, 55 insertions(+), 6 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 277ece2437707..138d246ec29d6 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include #include @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -119,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -241,6 +244,47 @@ void iomap_dio_bio_end_io(struct bio *bio) } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index 36d5c56e073ed..f6992a3bf66a4 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -5,5 +5,6 @@ #define IOEND_BATCH_SIZE 4096 u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 97d43c50cdf70..44f254ecab559 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -41,6 +41,8 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) if (!atomic_dec_and_test(&ioend->io_remaining)) return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); return iomap_finish_ioend_buffered(ioend); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 90c27875e39dc..5768b9f2a1cc3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -343,20 +343,22 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_UNWRITTEN (1U << 1) /* don't merge into previous ioend */ #define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) /* * Structure for writeback I/O completions. * - * File systems implementing ->submit_ioend can split a bio generated - * by iomap. In that case the parent ioend it was split from is recorded - * in ioend->io_parent. + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ -- GitLab From d06244c60aec1d5d1589efe6b611a5b91a49465c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:06 +0100 Subject: [PATCH 122/934] iomap: add a io_private field to struct iomap_ioend Add a private data field to struct iomap_ioend so that the file system can attach information to it. Zoned XFS will use this for a pointer to the open zone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-9-hch@lst.de Signed-off-by: Christian Brauner --- fs/iomap/ioend.c | 1 + include/linux/iomap.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 44f254ecab559..18894ebba6db1 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -23,6 +23,7 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode, ioend->io_offset = file_offset; ioend->io_size = bio->bi_iter.bi_size; ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; return ioend; } EXPORT_SYMBOL_GPL(iomap_init_ioend); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5768b9f2a1cc3..b4be07e8ec94d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -370,6 +370,7 @@ struct iomap_ioend { struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; -- GitLab From 02b39c4655d52141e07e80e9b2772d96daf67ff6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:07 +0100 Subject: [PATCH 123/934] iomap: pass private data to iomap_page_mkwrite Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-10-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 +++- fs/xfs/xfs_file.c | 3 ++- fs/zonefs/file.c | 2 +- include/linux/iomap.h | 5 ++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 4abff64998feb..8c24d8611edf3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1489,11 +1489,13 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, return length; } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345ec..785f6bbf1406f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1511,7 +1511,8 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + NULL); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c4..42e2c0065bb30 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b4be07e8ec94d..d528eb4d5cfe1 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -316,9 +316,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, -- GitLab From c6d1b8d15450cf061648d4e36d622da9d755654a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:08 +0100 Subject: [PATCH 124/934] iomap: pass private data to iomap_zero_range Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-11-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/gfs2/bmap.c | 3 ++- fs/iomap/buffered-io.c | 6 ++++-- fs/xfs/xfs_iomap.c | 2 +- include/linux/iomap.h | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf66..366516b98b3f3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8c24d8611edf3..382647fda1d10 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1391,13 +1391,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1465,7 +1466,8 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + NULL); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c9..3410c55f544a8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1497,7 +1497,7 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } int diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d528eb4d5cfe1..eddf524ac7498 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,7 +313,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, -- GitLab From ddd402bbbf669c4ada106fd2e4c799e2b5745e3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:09 +0100 Subject: [PATCH 125/934] iomap: pass private data to iomap_truncate_page Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-12-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 ++-- fs/xfs/xfs_iomap.c | 2 +- include/linux/iomap.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 382647fda1d10..3458f97d1b1e1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1467,7 +1467,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, if (!off) return 0; return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, - NULL); + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3410c55f544a8..5dd0922fe2d1d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1512,5 +1512,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eddf524ac7498..022d7f338c68d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -315,7 +315,7 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, -- GitLab From 00dac020ca2a2d82c3e4057a930794cca593ea77 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 5 Feb 2025 16:30:09 -0600 Subject: [PATCH 126/934] sysv: convert sysv to use the new mount api Convert the sysv filesystem to use the new mount API. Tested by mounting some old sysv & v7 images I found in archives; there are no mount options, and no remount op, so this conversion is trivial. Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/be08b1c1-c6d7-4e82-b457-87116879bdac@redhat.com Signed-off-by: Christian Brauner --- fs/sysv/super.c | 57 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 5c0d07ddbda25..03be9f1b78020 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "sysv.h" /* @@ -349,12 +350,13 @@ static int complete_read_super(struct super_block *sb, int silent, int size) return 1; } -static int sysv_fill_super(struct super_block *sb, void *data, int silent) +static int sysv_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh1, *bh = NULL; struct sysv_sb_info *sbi; unsigned long blocknr; int size = 0, i; + int silent = fc->sb_flags & SB_SILENT; BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); @@ -471,10 +473,11 @@ static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) return 1; } -static int v7_fill_super(struct super_block *sb, void *data, int silent) +static int v7_fill_super(struct super_block *sb, struct fs_context *fc) { struct sysv_sb_info *sbi; struct buffer_head *bh; + int silent = fc->sb_flags & SB_SILENT; BUILD_BUG_ON(sizeof(struct v7_super_block) != 440); BUILD_BUG_ON(sizeof(struct sysv_inode) != 64); @@ -528,33 +531,51 @@ failed: /* Every kernel module contains stuff like this. */ -static struct dentry *sysv_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysv_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); + return get_tree_bdev(fc, sysv_fill_super); } -static struct dentry *v7_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int v7_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); + return get_tree_bdev(fc, v7_fill_super); +} + +static const struct fs_context_operations sysv_context_ops = { + .get_tree = sysv_get_tree, +}; + +static const struct fs_context_operations v7_context_ops = { + .get_tree = v7_get_tree, +}; + +static int sysv_init_fs_context(struct fs_context *fc) +{ + fc->ops = &sysv_context_ops; + return 0; +} + +static int v7_init_fs_context(struct fs_context *fc) +{ + fc->ops = &v7_context_ops; + return 0; } static struct file_system_type sysv_fs_type = { - .owner = THIS_MODULE, - .name = "sysv", - .mount = sysv_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "sysv", + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = sysv_init_fs_context, }; MODULE_ALIAS_FS("sysv"); static struct file_system_type v7_fs_type = { - .owner = THIS_MODULE, - .name = "v7", - .mount = v7_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "v7", + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = v7_init_fs_context, }; MODULE_ALIAS_FS("v7"); MODULE_ALIAS("v7"); -- GitLab From fb3bbcfe344e64a46574a638b051ffd78762c12d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 6 Feb 2025 16:23:14 +0100 Subject: [PATCH 127/934] exit: change the release_task() paths to call flush_sigqueue() lockless A task can block a signal, accumulate up to RLIMIT_SIGPENDING sigqueues, and exit. In this case __exit_signal()->flush_sigqueue() called with irqs disabled can trigger a hard lockup, see https://lore.kernel.org/all/20190322114917.GC28876@redhat.com/ Fortunately, after the recent posixtimer changes sys_timer_delete() paths no longer try to clear SIGQUEUE_PREALLOC and/or free tmr->sigq, and after the exiting task passes __exit_signal() lock_task_sighand() can't succeed and pid_task(tmr->it_pid) will return NULL. This means that after __exit_signal(tsk) nobody can play with tsk->pending or (if group_dead) with tsk->signal->shared_pending, so release_task() can safely call flush_sigqueue() after write_unlock_irq(&tasklist_lock). TODO: - we can probably shift posix_cpu_timers_exit() as well - do_sigaction() can hit the similar problem Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250206152314.GA14620@redhat.com Reviewed-by: Frederic Weisbecker Signed-off-by: Christian Brauner --- kernel/exit.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 3485e5fc499e4..2d7444da743dc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -200,20 +200,13 @@ static void __exit_signal(struct task_struct *tsk) __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) @@ -279,6 +272,16 @@ repeat: proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); + /* + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. + */ + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); + put_task_struct_rcu_user(p); p = leader; -- GitLab From 43966114b49988ebca6b7d17eb68bad7740e9fb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 6 Feb 2025 16:23:34 +0100 Subject: [PATCH 128/934] exit: kill the pointless __exit_signal()->clear_tsk_thread_flag(TIF_SIGPENDING) It predates the git history and most probably it was never needed. It doesn't really hurt, but it looks confusing because its purpose is not clear at all. release_task(p) is called when this task has already passed exit_notify() so signal_pending(p) == T shouldn't make any difference. And even _if_ there were a subtle reason to clear TIF_SIGPENDING after exit_notify(), this clear_tsk_thread_flag() can't help anyway. If the exiting task is a group leader or if it is ptraced, release_task() will be likely called when this task has already done its last schedule() from do_task_dead(). Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250206152334.GB14620@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Christian Brauner --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index 2d7444da743dc..0acb94b17caae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -204,7 +204,6 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) tty_kref_put(tty); } -- GitLab From 1ab2785694971257f6a7dbd5a71bd8402b5fc305 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:10 +0100 Subject: [PATCH 129/934] exit: perform add_device_randomness() without tasklist_lock Parallel calls to add_device_randomness() contend on their own. The clone side aleady runs outside of tasklist_lock, which in turn means any caller on the exit side extends the tasklist_lock hold time while contending on the random-private lock. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-2-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 0acb94b17caae..a00273db024b3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -174,9 +174,6 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -270,6 +267,8 @@ repeat: write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); release_thread(p); /* * This task was already removed from the process/thread/pid lists -- GitLab From 6731cd97e60d6f3a30057c0fc513aed543187104 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:11 +0100 Subject: [PATCH 130/934] exit: hoist get_pid() in release_task() outside of tasklist_lock Reduces hold time as get_pid() contains an atomic. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-3-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index a00273db024b3..ebc1c5bec5122 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -240,9 +240,10 @@ repeat: cgroup_release(p); + thread_pid = get_pid(p->thread_pid); + write_lock_irq(&tasklist_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* -- GitLab From 74198dc2067b2aa14e411b29ce50ea3f418a43ab Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:12 +0100 Subject: [PATCH 131/934] pid: sprinkle tasklist_lock asserts They cost nothing on production kernels and document the requirement of holding the tasklist_lock lock in respective routines. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-4-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- kernel/pid.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index 924084713be8b..2ae872f689a75 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -339,17 +339,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid *pid = *task_pid_ptr(task, type); + struct pid *pid; + + lockdep_assert_held_write(&tasklist_lock); + + pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { - struct pid **pid_ptr = task_pid_ptr(task, type); - struct pid *pid; + struct pid **pid_ptr, *pid; int tmp; + lockdep_assert_held_write(&tasklist_lock); + + pid_ptr = task_pid_ptr(task, type); pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); @@ -386,6 +392,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + lockdep_assert_held_write(&tasklist_lock); + /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); @@ -403,6 +411,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); + lockdep_assert_held_write(&tasklist_lock); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } -- GitLab From 7903f907a226058ed99f86e9924e082aea57fc45 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:13 +0100 Subject: [PATCH 132/934] pid: perform free_pid() calls outside of tasklist_lock As the clone side already executes pid allocation with only pidmap_lock held, issuing free_pid() while still holding tasklist_lock exacerbates total hold time of the latter. More things may show up later which require initial clean up with the lock held and allow finishing without it. For that reason a struct to collect such work is added instead of merely passing the pid array. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-5-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- include/linux/pid.h | 7 ++++--- kernel/exit.c | 28 ++++++++++++++++++++-------- kernel/pid.c | 44 ++++++++++++++++++++++---------------------- kernel/sys.c | 14 +++++++++----- 4 files changed, 55 insertions(+), 38 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 98837a1ff0f33..311ecebd7d56a 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); -extern void detach_pid(struct task_struct *task, enum pid_type); -extern void change_pid(struct task_struct *task, enum pid_type, - struct pid *pid); +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type); +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type, + struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); @@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); +void free_pids(struct pid **pids); extern void disable_pid_allocation(struct pid_namespace *ns); /* diff --git a/kernel/exit.c b/kernel/exit.c index ebc1c5bec5122..fd960a67a4dcc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -122,14 +122,22 @@ static __init int kernel_exit_sysfs_init(void) late_initcall(kernel_exit_sysfs_init); #endif -static void __unhash_process(struct task_struct *p, bool group_dead) +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); + detach_pid(post->pids, p, PIDTYPE_PID); if (group_dead) { - detach_pid(p, PIDTYPE_TGID); - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); @@ -141,7 +149,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) /* * This function expects the tasklist_lock write-locked. */ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); @@ -194,7 +202,7 @@ static void __exit_signal(struct task_struct *tsk) task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; - __unhash_process(tsk, group_dead); + __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; @@ -228,10 +236,13 @@ void __weak release_thread(struct task_struct *dead_task) void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); @@ -244,7 +255,7 @@ repeat: write_lock_irq(&tasklist_lock); ptrace_release_task(p); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -270,6 +281,7 @@ repeat: put_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists diff --git a/kernel/pid.c b/kernel/pid.c index 2ae872f689a75..73625f28c1662 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. - */ - static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); @@ -128,10 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { - /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; + lockdep_assert_not_held(&tasklist_lock); + spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -160,6 +147,18 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } +void free_pids(struct pid **pids) +{ + int tmp; + + /* + * This can batch pidmap_lock. + */ + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (pids[tmp]) + free_pid(pids[tmp]); +} + struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { @@ -347,8 +346,8 @@ void attach_pid(struct task_struct *task, enum pid_type type) hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) +static void __change_pid(struct pid **pids, struct task_struct *task, + enum pid_type type, struct pid *new) { struct pid **pid_ptr, *pid; int tmp; @@ -370,18 +369,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type, if (pid_has_task(pid, tmp)) return; - free_pid(pid); + WARN_ON(pids[type]); + pids[type] = pid; } -void detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type) { - __change_pid(task, type, NULL); + __change_pid(pids, task, type, NULL); } -void change_pid(struct task_struct *task, enum pid_type type, +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type, struct pid *pid) { - __change_pid(task, type, pid); + __change_pid(pids, task, type, pid); attach_pid(task, type); } diff --git a/kernel/sys.c b/kernel/sys.c index cb366ff8703af..4efca8a97d620 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; + struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; @@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) goto out; if (task_pgrp(p) != pgrp) - change_pid(p, PIDTYPE_PGID, pgrp); + change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); + free_pids(pids); return err; } @@ -1222,21 +1224,22 @@ out: return retval; } -static void set_special_pids(struct pid *pid) +static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); + change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); + change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); + struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; @@ -1252,13 +1255,14 @@ int ksys_setsid(void) goto out; group_leader->signal->leader = 1; - set_special_pids(sid); + set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); + free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); -- GitLab From 627454c0f6708cb79dc58332e8033b8fa904c999 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:14 +0100 Subject: [PATCH 133/934] pid: drop irq disablement around pidmap_lock It no longer serves any purpose now that the tasklist_lock -> pidmap_lock ordering got eliminated. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-6-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- kernel/pid.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index 73625f28c1662..900193de42324 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -115,11 +115,10 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; - unsigned long flags; lockdep_assert_not_held(&tasklist_lock); - spin_lock_irqsave(&pidmap_lock, flags); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; @@ -142,7 +141,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } pidfs_remove_pid(pid); - spin_unlock_irqrestore(&pidmap_lock, flags); + spin_unlock(&pidmap_lock); call_rcu(&pid->rcu, delayed_put_pid); } @@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -237,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); if (nr < 0) { @@ -271,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, upid = pid->numbers + ns->level; idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pidfs_add_pid(pid); @@ -280,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); return pid; out_unlock: - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); put_pid_ns(ns); out_free: - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -301,7 +300,7 @@ out_free: if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -309,9 +308,9 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) -- GitLab From 6d3f8fb4b2d47b7b6581a1a5a6ce35700f1416e6 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 7 Feb 2025 15:40:51 +0900 Subject: [PATCH 134/934] sched_ext: Add an event, SCX_EV_ENQ_SLICE_DFL Add a core event, SCX_EV_ENQ_SLICE_DFL, which represents how many tasks have been enqueued (or pick_task-ed or select_cpu-ed) with a default time slice (SCX_SLICE_DFL). Scheduling a task with SCX_SLICE_DFL unintentionally would be a source of latency spikes because SCX_SLICE_DFL is relatively long (20 msec). Thus, soaring the SCX_EV_ENQ_SLICE_DFL value would be a sign of BPF scheduler bugs, causing latency spikes, especially when ops.select_cpu() is provided. __scx_add_event() is used since the caller holds an rq lock or p->pi_lock, so the preemption has already been disabled. Signed-off-by: Changwoo Min Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8a9a30895381a..5ef90d9bcdd2d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1468,6 +1468,12 @@ struct scx_event_stats { */ u64 SCX_EV_ENQ_SKIP_EXITING; + /* + * The total number of tasks enqueued (or pick_task-ed) with a + * default time slice (SCX_SLICE_DFL). + */ + u64 SCX_EV_ENQ_SLICE_DFL; + /* * The total duration of bypass modes in nanoseconds. */ @@ -2134,6 +2140,7 @@ local: */ touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; @@ -2141,6 +2148,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); dispatch_enqueue(find_global_dsq(p), p, enq_flags); } @@ -3202,8 +3210,10 @@ static struct task_struct *pick_task_scx(struct rq *rq) */ if (keep_prev) { p = prev; - if (!p->scx.slice) + if (!p->scx.slice) { p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } } else { p = first_local_task(rq); if (!p) { @@ -3219,6 +3229,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } } @@ -3306,6 +3317,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (found) { p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } if (rq_bypass) @@ -5023,6 +5035,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); @@ -7163,6 +7176,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); -- GitLab From 38d65cd692a26e09152bcec67d641a7914a5cba6 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 7 Feb 2025 15:40:52 +0900 Subject: [PATCH 135/934] sched_ext: Print an event, SCX_EV_ENQ_SLICE_DFL, in scx_qmap/central Modify the scx_qmap and scx_celtral schedulers to print the SCX_EV_ENQ_SLICE_DFL event every second. Signed-off-by: Changwoo Min Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.bpf.c | 2 ++ tools/sched_ext/scx_qmap.bpf.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 376c14d5dd0d9..907a844723c12 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -305,6 +305,8 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SKIP_EXITING", scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SLICE_DFL", + scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DURATION", scx_read_event(&events, SCX_EV_BYPASS_DURATION)); bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DISPATCH", diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5edb79742e379..7d9d1e5d23589 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -784,6 +784,8 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SKIP_EXITING", scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SLICE_DFL", + scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DURATION", scx_read_event(&events, SCX_EV_BYPASS_DURATION)); bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DISPATCH", -- GitLab From da8d493a80993972c427002684d0742560f3be4a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 20 Jan 2025 16:10:00 +0100 Subject: [PATCH 136/934] firmware: qcom: uefisecapp: fix efivars registration race Since the conversion to using the TZ allocator, the efivars service is registered before the memory pool has been allocated, something which can lead to a NULL-pointer dereference in case of a racing EFI variable access. Make sure that all resources have been set up before registering the efivars. Fixes: 6612103ec35a ("firmware: qcom: qseecom: convert to using the TZ allocator") Cc: stable@vger.kernel.org # 6.11 Cc: Bartosz Golaszewski Signed-off-by: Johan Hovold Reviewed-by: Konrad Dybcio Reviewed-by: Maximilian Luz Reviewed-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20250120151000.13870-1-johan+linaro@kernel.org Signed-off-by: Bjorn Andersson --- .../firmware/qcom/qcom_qseecom_uefisecapp.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c b/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c index 447246bd04be3..98a463e9774bf 100644 --- a/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c +++ b/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c @@ -814,15 +814,6 @@ static int qcom_uefisecapp_probe(struct auxiliary_device *aux_dev, qcuefi->client = container_of(aux_dev, struct qseecom_client, aux_dev); - auxiliary_set_drvdata(aux_dev, qcuefi); - status = qcuefi_set_reference(qcuefi); - if (status) - return status; - - status = efivars_register(&qcuefi->efivars, &qcom_efivar_ops); - if (status) - qcuefi_set_reference(NULL); - memset(&pool_config, 0, sizeof(pool_config)); pool_config.initial_size = SZ_4K; pool_config.policy = QCOM_TZMEM_POLICY_MULTIPLIER; @@ -833,6 +824,15 @@ static int qcom_uefisecapp_probe(struct auxiliary_device *aux_dev, if (IS_ERR(qcuefi->mempool)) return PTR_ERR(qcuefi->mempool); + auxiliary_set_drvdata(aux_dev, qcuefi); + status = qcuefi_set_reference(qcuefi); + if (status) + return status; + + status = efivars_register(&qcuefi->efivars, &qcom_efivar_ops); + if (status) + qcuefi_set_reference(NULL); + return status; } -- GitLab From 548b0c5de7619ef53bbde5590700693f2f6d2a56 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 2 Feb 2025 17:04:13 +0100 Subject: [PATCH 137/934] batman-adv: Ignore own maximum aggregation size during RX An OGMv1 and OGMv2 packet receive processing were not only limited by the number of bytes in the received packet but also by the nodes maximum aggregation packet size limit. But this limit is relevant for TX and not for RX. It must not be enforced by batadv_(i)v_ogm_aggr_packet to avoid loss of information in case of a different limit for sender and receiver. This has a minor side effect for B.A.T.M.A.N. IV because the batadv_iv_ogm_aggr_packet is also used for the preprocessing for the TX. But since the aggregation code itself will not allow more than BATADV_MAX_AGGREGATION_BYTES bytes, this check was never triggering (in this context) prior of removing it. Cc: stable@vger.kernel.org Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Fixes: 9323158ef9f4 ("batman-adv: OGMv2 - implement originators logic") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 3 +-- net/batman-adv/bat_v_ogm.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 07ae5dd1f150b..b12645949ae5a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -325,8 +325,7 @@ batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /* send a batman ogm to a given interface */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e503ee0d896bd..8f89ffe6020ce 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -839,8 +839,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /** -- GitLab From 46a0e1615886b87bdb18cf3f29bf494a30aafc01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2025 20:39:11 -1000 Subject: [PATCH 138/934] tool/sched_ext: Event counter dumping updates - There's no need to dump event counters from both scx_qmap and scx_central. Drop counter dumping from scx_central. - bpf_printk() implies a trailing new line and the explicit new line leads to double new lines. Drop the explicit new lines. Signed-off-by: Tejun Heo Acked-by: Changwoo Min --- tools/sched_ext/scx_central.bpf.c | 21 --------------------- tools/sched_ext/scx_qmap.bpf.c | 16 ++++++++-------- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 907a844723c12..5c165af1fa27c 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -293,27 +293,6 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); __sync_fetch_and_add(&nr_timers, 1); - /* print event counters every second */ - if (nr_timers % 1000 == 0) { - scx_bpf_events(&events, sizeof(events)); - - bpf_printk("%35s: %llu\n", "SCX_EV_SELECT_CPU_FALLBACK", - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", - scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_KEEP_LAST", - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SKIP_EXITING", - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SLICE_DFL", - scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); - bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DURATION", - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DISPATCH", - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_ACTIVATE", - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); - } return 0; } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 7d9d1e5d23589..e0e766d402e1c 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -776,21 +776,21 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) scx_bpf_events(&events, sizeof(events)); - bpf_printk("%35s: %llu\n", "SCX_EV_SELECT_CPU_FALLBACK", + bpf_printk("%35s: %llu", "SCX_EV_SELECT_CPU_FALLBACK", scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + bpf_printk("%35s: %llu", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %llu\n", "SCX_EV_DISPATCH_KEEP_LAST", + bpf_printk("%35s: %llu", "SCX_EV_DISPATCH_KEEP_LAST", scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SKIP_EXITING", + bpf_printk("%35s: %llu", "SCX_EV_ENQ_SKIP_EXITING", scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %llu\n", "SCX_EV_ENQ_SLICE_DFL", + bpf_printk("%35s: %llu", "SCX_EV_ENQ_SLICE_DFL", scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); - bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DURATION", + bpf_printk("%35s: %llu", "SCX_EV_BYPASS_DURATION", scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_DISPATCH", + bpf_printk("%35s: %llu", "SCX_EV_BYPASS_DISPATCH", scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %llu\n", "SCX_EV_BYPASS_ACTIVATE", + bpf_printk("%35s: %llu", "SCX_EV_BYPASS_ACTIVATE", scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -- GitLab From 26176116d931621fd96bcef15e7b42a9ddf524da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2025 20:39:11 -1000 Subject: [PATCH 139/934] sched_ext: Count SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE in the right spot SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE wasn't quite right in two aspects: - It counted both migration disabled and offline events. - It didn't count events from scx_bpf_dsq_move() path. Fix it by moving the counting into task_can_run_on_remote_rq() which is shared by both paths and can distinguish the different rejection conditions. The argument @trigger_error is renamed to @enforce as it now does more than just triggering error. Signed-off-by: Tejun Heo Acked-by: Changwoo Min --- kernel/sched/ext.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5dfcba6adcdaf..310db8484b776 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2431,7 +2431,7 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, - bool trigger_error) + bool enforce) { int cpu = cpu_of(rq); @@ -2444,7 +2444,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { - if (trigger_error) + if (enforce) scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", cpu_of(rq), p->comm, p->pid); return false; @@ -2456,8 +2456,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ SCHED_WARN_ON(is_migration_disabled(p)); - if (!scx_rq_online(rq)) + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; + } return true; } @@ -2527,7 +2530,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2717,7 +2720,6 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); - __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return; } -- GitLab From eace54dff05157ed2629848111366b836dcf3ad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2025 20:39:11 -1000 Subject: [PATCH 140/934] sched_ext: Add SCX_EV_ENQ_SKIP_MIGRATION_DISABLED Count the number of times a migration disabled task is automatically dispatched to its local DSQ. Signed-off-by: Tejun Heo Acked-by: Changwoo Min --- kernel/sched/ext.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 310db8484b776..b2378e29f45aa 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1483,6 +1483,13 @@ struct scx_event_stats { */ u64 SCX_EV_ENQ_SKIP_EXITING; + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + u64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + /* * The total number of tasks enqueued (or pick_task-ed) with a * default time slice (SCX_SLICE_DFL). @@ -2119,8 +2126,10 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && - is_migration_disabled(p)) + is_migration_disabled(p)) { + __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; + } if (!SCX_HAS_OP(enqueue)) goto global; @@ -5067,6 +5076,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); @@ -7210,6 +7220,7 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); -- GitLab From 372033ad9e98dce205fc3c48a146cfff5699e3e9 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 9 Feb 2025 10:53:53 +0900 Subject: [PATCH 141/934] tools/sched_ext: Compatible testing of SCX_ENQ_CPU_SELECTED This provides compatible testing of SCX_ENQ_CPU_SELECTED. More specifically, it handles two cases: 1. a BPF scheduler is compiled against vmlinux.h where SCX_ENQ_CPU_SELECTED is defined, but it runs on a kernel that does not have SCX_ENQ_CPU_SELECTED. In this case, the test result of 'enq_flags & SCX_ENQ_CPU_SELECTED' will always be false. That test result is semantically incorrect because the kernel before SCX_ENQ_CPU_SELECTED has never skipped select_task_rq_scx(), so the result should be true. 2. a BPF scheduler is compiling against vmlinux.h where SCX_ENQ_CPU_SELECTED is not defined. In this case, directly using SCX_ENQ_CPU_SELECTED causes compilation errors. To hide such complexity, introduce __COMPAT_is_enq_cpu_selected(), which checks if SCX_ENQ_CPU_SELECTED exists in runtime using BPF CO-RE. This consists of three parts: 1. Add enum_defs.autogen.h, which has macros (HAVE_{enum name}) denoting whether SCX enums are defined in the vmlinux.h or not. 2. Implement __COMPAT_is_enq_cpu_selected(), which provide the test of SCX_ENQ_CPU_SELECTED in a compatible way. 3. Use __COMPAT_is_enq_cpu_selected() in scx_qmap. Note that this is a sync of the relevant PR [1] in the scx repo. [1] https://github.com/sched-ext/scx/pull/1314 Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 1 + tools/sched_ext/include/scx/common.h | 1 + tools/sched_ext/include/scx/compat.bpf.h | 52 ++++++++ .../sched_ext/include/scx/enum_defs.autogen.h | 119 ++++++++++++++++++ tools/sched_ext/scx_qmap.bpf.c | 2 +- 5 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/include/scx/enum_defs.autogen.h diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index ae717f4d6ede1..f1caf9fc8f8cc 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -18,6 +18,7 @@ #include #include #include "user_exit_info.h" +#include "enum_defs.autogen.h" #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index dc18b99e55cd1..1dc76bd842966 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -16,6 +16,7 @@ #include #include #include +#include "enum_defs.autogen.h" typedef uint8_t u8; typedef uint16_t u16; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 50e1499ae0935..e5fa72f9bf22b 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -125,6 +125,58 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, false; \ }) +/** + * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on + * in a compatible way. We will preserve this __COMPAT helper until v6.16. + * + * @enq_flags: enqueue flags from ops.enqueue() + * + * Return: True if SCX_ENQ_CPU_SELECTED is turned on in @enq_flags + */ +static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) +{ +#ifdef HAVE_SCX_ENQ_CPU_SELECTED + /* + * This is the case that a BPF code compiled against vmlinux.h + * where the enum SCX_ENQ_CPU_SELECTED exists. + */ + + /* + * We should temporarily suspend the macro expansion of + * 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED' being + * rewritten to '__SCX_ENQ_CPU_SELECTED' when 'SCX_ENQ_CPU_SELECTED' + * is defined in 'scripts/gen_enums.py'. + */ +#pragma push_macro("SCX_ENQ_CPU_SELECTED") +#undef SCX_ENQ_CPU_SELECTED + u64 flag; + + /* + * When the kernel did not have SCX_ENQ_CPU_SELECTED, + * select_task_rq_scx() has never been skipped. Thus, this case + * should be considered that the CPU has already been selected. + */ + if (!bpf_core_enum_value_exists(enum scx_enq_flags, + SCX_ENQ_CPU_SELECTED)) + return true; + + flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED); + return enq_flags & flag; + + /* + * Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'. + */ +#pragma pop_macro("SCX_ENQ_CPU_SELECTED") +#else + /* + * This is the case that a BPF code compiled against vmlinux.h + * where the enum SCX_ENQ_CPU_SELECTED does NOT exist. + */ + return true; +#endif /* HAVE_SCX_ENQ_CPU_SELECTED */ +} + + #define scx_bpf_now() \ (bpf_ksym_exists(scx_bpf_now) ? \ scx_bpf_now() : \ diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h new file mode 100644 index 0000000000000..5551816856ee7 --- /dev/null +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -0,0 +1,119 @@ +/* + * WARNING: This file is autogenerated from scripts/gen_enum_defs.py. + */ + +#ifndef __ENUM_DEFS_AUTOGEN_H__ +#define __ENUM_DEFS_AUTOGEN_H__ + + +#define HAVE_SCX_DSP_DFL_MAX_BATCH +#define HAVE_SCX_DSP_MAX_LOOPS +#define HAVE_SCX_WATCHDOG_MAX_TIMEOUT +#define HAVE_SCX_EXIT_BT_LEN +#define HAVE_SCX_EXIT_MSG_LEN +#define HAVE_SCX_EXIT_DUMP_DFL_LEN +#define HAVE_SCX_CPUPERF_ONE +#define HAVE_SCX_OPS_TASK_ITER_BATCH +#define HAVE_SCX_CPU_PREEMPT_RT +#define HAVE_SCX_CPU_PREEMPT_DL +#define HAVE_SCX_CPU_PREEMPT_STOP +#define HAVE_SCX_CPU_PREEMPT_UNKNOWN +#define HAVE_SCX_DEQ_SLEEP +#define HAVE_SCX_DEQ_CORE_SCHED_EXEC +#define HAVE_SCX_DSQ_FLAG_BUILTIN +#define HAVE_SCX_DSQ_FLAG_LOCAL_ON +#define HAVE_SCX_DSQ_INVALID +#define HAVE_SCX_DSQ_GLOBAL +#define HAVE_SCX_DSQ_LOCAL +#define HAVE_SCX_DSQ_LOCAL_ON +#define HAVE_SCX_DSQ_LOCAL_CPU_MASK +#define HAVE_SCX_DSQ_ITER_REV +#define HAVE___SCX_DSQ_ITER_HAS_SLICE +#define HAVE___SCX_DSQ_ITER_HAS_VTIME +#define HAVE___SCX_DSQ_ITER_USER_FLAGS +#define HAVE___SCX_DSQ_ITER_ALL_FLAGS +#define HAVE_SCX_DSQ_LNODE_ITER_CURSOR +#define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT +#define HAVE_SCX_ENQ_WAKEUP +#define HAVE_SCX_ENQ_HEAD +#define HAVE_SCX_ENQ_CPU_SELECTED +#define HAVE_SCX_ENQ_PREEMPT +#define HAVE_SCX_ENQ_REENQ +#define HAVE_SCX_ENQ_LAST +#define HAVE___SCX_ENQ_INTERNAL_MASK +#define HAVE_SCX_ENQ_CLEAR_OPSS +#define HAVE_SCX_ENQ_DSQ_PRIQ +#define HAVE_SCX_TASK_DSQ_ON_PRIQ +#define HAVE_SCX_TASK_QUEUED +#define HAVE_SCX_TASK_RESET_RUNNABLE_AT +#define HAVE_SCX_TASK_DEQD_FOR_SLEEP +#define HAVE_SCX_TASK_STATE_SHIFT +#define HAVE_SCX_TASK_STATE_BITS +#define HAVE_SCX_TASK_STATE_MASK +#define HAVE_SCX_TASK_CURSOR +#define HAVE_SCX_ECODE_RSN_HOTPLUG +#define HAVE_SCX_ECODE_ACT_RESTART +#define HAVE_SCX_EXIT_NONE +#define HAVE_SCX_EXIT_DONE +#define HAVE_SCX_EXIT_UNREG +#define HAVE_SCX_EXIT_UNREG_BPF +#define HAVE_SCX_EXIT_UNREG_KERN +#define HAVE_SCX_EXIT_SYSRQ +#define HAVE_SCX_EXIT_ERROR +#define HAVE_SCX_EXIT_ERROR_BPF +#define HAVE_SCX_EXIT_ERROR_STALL +#define HAVE_SCX_KF_UNLOCKED +#define HAVE_SCX_KF_CPU_RELEASE +#define HAVE_SCX_KF_DISPATCH +#define HAVE_SCX_KF_ENQUEUE +#define HAVE_SCX_KF_SELECT_CPU +#define HAVE_SCX_KF_REST +#define HAVE___SCX_KF_RQ_LOCKED +#define HAVE___SCX_KF_TERMINAL +#define HAVE_SCX_KICK_IDLE +#define HAVE_SCX_KICK_PREEMPT +#define HAVE_SCX_KICK_WAIT +#define HAVE_SCX_OPI_BEGIN +#define HAVE_SCX_OPI_NORMAL_BEGIN +#define HAVE_SCX_OPI_NORMAL_END +#define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN +#define HAVE_SCX_OPI_CPU_HOTPLUG_END +#define HAVE_SCX_OPI_END +#define HAVE_SCX_OPS_ENABLING +#define HAVE_SCX_OPS_ENABLED +#define HAVE_SCX_OPS_DISABLING +#define HAVE_SCX_OPS_DISABLED +#define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE +#define HAVE_SCX_OPS_ENQ_LAST +#define HAVE_SCX_OPS_ENQ_EXITING +#define HAVE_SCX_OPS_SWITCH_PARTIAL +#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT +#define HAVE_SCX_OPS_ALL_FLAGS +#define HAVE_SCX_OPSS_NONE +#define HAVE_SCX_OPSS_QUEUEING +#define HAVE_SCX_OPSS_QUEUED +#define HAVE_SCX_OPSS_DISPATCHING +#define HAVE_SCX_OPSS_QSEQ_SHIFT +#define HAVE_SCX_PICK_IDLE_CORE +#define HAVE_SCX_OPS_NAME_LEN +#define HAVE_SCX_SLICE_DFL +#define HAVE_SCX_SLICE_INF +#define HAVE_SCX_RQ_ONLINE +#define HAVE_SCX_RQ_CAN_STOP_TICK +#define HAVE_SCX_RQ_BAL_PENDING +#define HAVE_SCX_RQ_BAL_KEEP +#define HAVE_SCX_RQ_BYPASSING +#define HAVE_SCX_RQ_IN_WAKEUP +#define HAVE_SCX_RQ_IN_BALANCE +#define HAVE_SCX_TASK_NONE +#define HAVE_SCX_TASK_INIT +#define HAVE_SCX_TASK_READY +#define HAVE_SCX_TASK_ENABLED +#define HAVE_SCX_TASK_NR_STATES +#define HAVE_SCX_TG_ONLINE +#define HAVE_SCX_TG_INITED +#define HAVE_SCX_WAKE_FORK +#define HAVE_SCX_WAKE_TTWU +#define HAVE_SCX_WAKE_SYNC + +#endif /* __ENUM_DEFS_AUTOGEN_H__ */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index e0e766d402e1c..a6c6be308315a 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -231,7 +231,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) } /* if select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && + if (!__COMPAT_is_enq_cpu_selected(enq_flags) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); -- GitLab From dde5625d4d75787a59e95f4d7d6983714bfc14c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 9 Feb 2025 14:25:45 +0100 Subject: [PATCH 142/934] tools/nolibc: add support for sys_llseek() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all architectures have the old sys_lseek(), notably riscv32. Implement lseek() in terms of sys_llseek() in that case. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250209-nolibc-dir-v2-1-57cc1da8558b@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/sys.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 8c0a55bc9dc3a..8f44c33b12130 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -597,10 +597,37 @@ off_t sys_lseek(int fd, off_t offset, int whence) #endif } +static __attribute__((unused)) +int sys_llseek(int fd, unsigned long offset_high, unsigned long offset_low, + __kernel_loff_t *result, int whence) +{ +#ifdef __NR_llseek + return my_syscall5(__NR_llseek, fd, offset_high, offset_low, result, whence); +#else + return __nolibc_enosys(__func__, fd, offset_high, offset_low, result, whence); +#endif +} + static __attribute__((unused)) off_t lseek(int fd, off_t offset, int whence) { - return __sysret(sys_lseek(fd, offset, whence)); + __kernel_loff_t loff = 0; + off_t result; + int ret; + + result = sys_lseek(fd, offset, whence); + if (result == -ENOSYS) { + /* Only exists on 32bit where nolibc off_t is also 32bit */ + ret = sys_llseek(fd, 0, offset, &loff, whence); + if (ret < 0) + result = ret; + else if (loff != (off_t)loff) + result = -EOVERFLOW; + else + result = loff; + } + + return __sysret(result); } -- GitLab From 665fa8dea90d9fbc0e7137c7e1315d6f7e15757e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 9 Feb 2025 14:25:46 +0100 Subject: [PATCH 143/934] tools/nolibc: add support for directory access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an implementation for directory access operations. To keep nolibc itself allocation-free, a "DIR *" does not point to any data, but directly encodes a filedescriptor number, equivalent to "FILE *". Without any per-directory storage it is not possible to implement readdir() POSIX confirming. Instead only readdir_r() is provided. While readdir_r() is deprecated in glibc, the reasons for that are not applicable to nolibc. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250209-nolibc-dir-v2-2-57cc1da8558b@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/Makefile | 1 + tools/include/nolibc/dirent.h | 98 ++++++++++++++++++++ tools/include/nolibc/nolibc.h | 1 + tools/testing/selftests/nolibc/nolibc-test.c | 39 ++++++++ 4 files changed, 139 insertions(+) create mode 100644 tools/include/nolibc/dirent.h diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index a1f55fb24bb38..dceec0e1a1351 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -29,6 +29,7 @@ all_files := \ compiler.h \ crt.h \ ctype.h \ + dirent.h \ errno.h \ nolibc.h \ signal.h \ diff --git a/tools/include/nolibc/dirent.h b/tools/include/nolibc/dirent.h new file mode 100644 index 0000000000000..c5c30d0dd6806 --- /dev/null +++ b/tools/include/nolibc/dirent.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Directory access for NOLIBC + * Copyright (C) 2025 Thomas Weißschuh + */ + +#ifndef _NOLIBC_DIRENT_H +#define _NOLIBC_DIRENT_H + +#include "stdint.h" +#include "types.h" + +#include + +struct dirent { + ino_t d_ino; + char d_name[NAME_MAX + 1]; +}; + +/* See comment of FILE in stdio.h */ +typedef struct { + char dummy[1]; +} DIR; + +static __attribute__((unused)) +DIR *fdopendir(int fd) +{ + if (fd < 0) { + SET_ERRNO(EBADF); + return NULL; + } + return (DIR *)(intptr_t)~fd; +} + +static __attribute__((unused)) +DIR *opendir(const char *name) +{ + int fd; + + fd = open(name, O_RDONLY); + if (fd == -1) + return NULL; + return fdopendir(fd); +} + +static __attribute__((unused)) +int closedir(DIR *dirp) +{ + intptr_t i = (intptr_t)dirp; + + if (i >= 0) { + SET_ERRNO(EBADF); + return -1; + } + return close(~i); +} + +static __attribute__((unused)) +int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result) +{ + char buf[sizeof(struct linux_dirent64) + NAME_MAX + 1]; + struct linux_dirent64 *ldir = (void *)buf; + intptr_t i = (intptr_t)dirp; + int fd, ret; + + if (i >= 0) + return EBADF; + + fd = ~i; + + ret = sys_getdents64(fd, ldir, sizeof(buf)); + if (ret < 0) + return -ret; + if (ret == 0) { + *result = NULL; + return 0; + } + + /* + * getdents64() returns as many entries as fit the buffer. + * readdir() can only return one entry at a time. + * Make sure the non-returned ones are not skipped. + */ + ret = lseek(fd, ldir->d_off, SEEK_SET); + if (ret == -1) + return errno; + + entry->d_ino = ldir->d_ino; + /* the destination should always be big enough */ + strlcpy(entry->d_name, ldir->d_name, sizeof(entry->d_name)); + *result = entry; + return 0; +} + +/* make sure to include all global symbols */ +#include "nolibc.h" + +#endif /* _NOLIBC_DIRENT_H */ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 92436b1e44413..05d92afedb725 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -105,6 +105,7 @@ #include "string.h" #include "time.h" #include "stackprotector.h" +#include "dirent.h" /* Used by programs to avoid std includes */ #define NOLIBC diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index f162793b162f9..798fbdcd3ff8c 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -769,6 +769,44 @@ int test_getdents64(const char *dir) return ret; } +static int test_dirent(void) +{ + int comm = 0, cmdline = 0; + struct dirent dirent, *result; + DIR *dir; + int ret; + + dir = opendir("/proc/self"); + if (!dir) + return 1; + + while (1) { + errno = 0; + ret = readdir_r(dir, &dirent, &result); + if (ret != 0) + return 1; + if (!result) + break; + + if (strcmp(dirent.d_name, "comm") == 0) + comm++; + else if (strcmp(dirent.d_name, "cmdline") == 0) + cmdline++; + } + + if (errno) + return 1; + + ret = closedir(dir); + if (ret) + return 1; + + if (comm != 1 || cmdline != 1) + return 1; + + return 0; +} + int test_getpagesize(void) { int x = getpagesize(); @@ -1061,6 +1099,7 @@ int run_syscall(int min, int max) CASE_TEST(fork); EXPECT_SYSZR(1, test_fork()); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; + CASE_TEST(directories); EXPECT_SYSZR(proc, test_dirent()); break; CASE_TEST(gettimeofday_tv); EXPECT_SYSZR(1, gettimeofday(&tv, NULL)); break; CASE_TEST(gettimeofday_tv_tz);EXPECT_SYSZR(1, gettimeofday(&tv, &tz)); break; CASE_TEST(getpagesize); EXPECT_SYSZR(1, test_getpagesize()); break; -- GitLab From 9748cb2dc393e9095c39d6de0786c7b4e07b2530 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Feb 2025 16:42:43 +1100 Subject: [PATCH 144/934] VFS: repack DENTRY_ flags. Bits 13, 23, 24, and 27 are not used. Move all those holes to the end. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250206054504.2950516-7-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/dcache.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb603656756..bb7216b220910 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -203,34 +203,34 @@ struct dentry_operations { #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(13) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED BIT(15) +#define DCACHE_DENTRY_KILLED BIT(14) -#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(15) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(16) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(17) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST BIT(19) +#define DCACHE_LRU_LIST BIT(18) -#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ -#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ -#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ -#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ -#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ +#define DCACHE_ENTRY_TYPE (7 << 19) /* bits 19..21 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 19) /* Negative dentry */ +#define DCACHE_WHITEOUT_TYPE (1 << 19) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 19) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 19) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 19) /* Regular file type */ +#define DCACHE_SPECIAL_TYPE (5 << 19) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 19) /* Symlink */ -#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL BIT(26) +#define DCACHE_NOKEY_NAME BIT(22) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(23) -#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR BIT(29) -#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ +#define DCACHE_PAR_LOOKUP BIT(24) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(25) +#define DCACHE_NORCU BIT(26) /* No RCU delay for freeing */ extern seqlock_t rename_lock; -- GitLab From 2c3230fb8db9bf04d97a907f2fb86adb1e74e431 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Feb 2025 16:42:44 +1100 Subject: [PATCH 145/934] VFS: repack LOOKUP_ bit flags. The LOOKUP_ bits are not in order, which can make it awkward when adding new bits. Two bits have recently been added to the end which makes them look like "scoping flags", but in fact they aren't. Also LOOKUP_PARENT is described as "internal use only" but is used in fs/nfs/ This patch: - Moves these three flags into the "pathwalk mode" section - changes all bits to use the BIT(n) macro - Allocates bits in order leaving gaps between the sections, and documents those gaps. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250206054504.2950516-8-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/namei.h | 45 ++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/include/linux/namei.h b/include/linux/namei.h index 8ec8fed3bce81..e3042176cdf48 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -18,35 +18,36 @@ enum { MAX_NESTED_LINKS = 8 }; enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ -#define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ -#define LOOKUP_DIRECTORY 0x0002 /* require a directory */ -#define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ -#define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ -#define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ -#define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ - -#define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ -#define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ +#define LOOKUP_DIRECTORY BIT(1) /* require a directory */ +#define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */ +#define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */ +#define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */ +#define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */ +#define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */ +#define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */ +#define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ +#define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ +/* 5 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ -#define LOOKUP_OPEN 0x0100 /* ... in open */ -#define LOOKUP_CREATE 0x0200 /* ... in object creation */ -#define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ -#define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ +#define LOOKUP_OPEN BIT(16) /* ... in open */ +#define LOOKUP_CREATE BIT(17) /* ... in object creation */ +#define LOOKUP_EXCL BIT(18) /* ... in target must not exist */ +#define LOOKUP_RENAME_TARGET BIT(19) /* ... in destination of rename() */ -/* internal use only */ -#define LOOKUP_PARENT 0x0010 +/* 4 spare bits for intent */ /* Scoping flags for lookup. */ -#define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ -#define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ -#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ -#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ -#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ -#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ -#define LOOKUP_LINKAT_EMPTY 0x400000 /* Linkat request with empty path. */ +#define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */ +#define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */ +#define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. */ +#define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */ +#define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) +/* 3 spare bits for scoping */ extern int path_pts(struct path *path); -- GitLab From abb0ea1923a68ec8f45a7615ebad1fc87ea06da6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:44 -0500 Subject: [PATCH 146/934] iomap: factor out iomap length helper In preparation to support more granular iomap iter advancing, factor the pos/len values as parameters to length calculation. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-2-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 022d7f338c68d..feb748eb62945 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -238,18 +238,33 @@ struct iomap_iter { int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); } /** -- GitLab From 2e4b0b6cf5333f3a8e36d211dd81dd1074ff66c2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:45 -0500 Subject: [PATCH 147/934] iomap: split out iomap check and reset logic from iter advance In preparation for more granular iomap_iter advancing, break out some of the logic associated with higher level iteration from iomap_advance_iter(). Specifically, factor the iomap reset code into a separate helper and lift the iomap.length check into the calling code, similar to how ->iomap_end() calls are handled. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-3-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 3790918646af7..731ea7267f270 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,6 +7,13 @@ #include #include "trace.h" +static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) +{ + iter->processed = 0; + memset(&iter->iomap, 0, sizeof(iter->iomap)); + memset(&iter->srcmap, 0, sizeof(iter->srcmap)); +} + /* * Advance to the next range we need to map. * @@ -14,32 +21,24 @@ * processed - it was aborted because the extent the iomap spanned may have been * changed during the operation. In this case, the iteration behaviour is to * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. iter->processed = 0). Hence the - * "finished iterating" case needs to distinguish between - * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we - * need to remap the entire remaining range. + * even when we've made no progress (i.e. count = 0). Hence the "finished + * iterating" case needs to distinguish between (count = 0) meaning we are done + * and (count = 0 && stale) meaning we need to remap the entire remaining range. */ -static inline int iomap_iter_advance(struct iomap_iter *iter) +static inline int iomap_iter_advance(struct iomap_iter *iter, s64 count) { bool stale = iter->iomap.flags & IOMAP_F_STALE; int ret = 1; - /* handle the previous iteration (if any) */ - if (iter->iomap.length) { - if (iter->processed < 0) - return iter->processed; - if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) - return -EIO; - iter->pos += iter->processed; - iter->len -= iter->processed; - if (!iter->len || (!iter->processed && !stale)) - ret = 0; - } + if (count < 0) + return count; + if (WARN_ON_ONCE(count > iomap_length(iter))) + return -EIO; + iter->pos += count; + iter->len -= count; + if (!iter->len || (!count && !stale)) + ret = 0; - /* clear the per iteration state */ - iter->processed = 0; - memset(&iter->iomap, 0, sizeof(iter->iomap)); - memset(&iter->srcmap, 0, sizeof(iter->srcmap)); return ret; } @@ -82,10 +81,14 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) return ret; } + /* advance and clear state from the previous iteration */ trace_iomap_iter(iter, ops, _RET_IP_); - ret = iomap_iter_advance(iter); - if (ret <= 0) - return ret; + if (iter->iomap.length) { + ret = iomap_iter_advance(iter, iter->processed); + iomap_iter_reset_iomap(iter); + if (ret <= 0) + return ret; + } ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); -- GitLab From f4799838662340b3e53da8ff6bcabac743d80f17 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:46 -0500 Subject: [PATCH 148/934] iomap: refactor iomap_iter() length check and tracepoint iomap_iter() checks iomap.length to skip individual code blocks not appropriate for the initial case where there is no mapping in the iter. To prepare for upcoming changes, refactor the code to jump straight to the ->iomap_begin() handler in the initial case and move the tracepoint to the top of the function so it always executes. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-4-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 731ea7267f270..a2ae99fe64316 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -73,7 +73,12 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { int ret; - if (iter->iomap.length && ops->iomap_end) { + trace_iomap_iter(iter, ops, _RET_IP_); + + if (!iter->iomap.length) + goto begin; + + if (ops->iomap_end) { ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), iter->processed > 0 ? iter->processed : 0, iter->flags, &iter->iomap); @@ -82,14 +87,12 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) } /* advance and clear state from the previous iteration */ - trace_iomap_iter(iter, ops, _RET_IP_); - if (iter->iomap.length) { - ret = iomap_iter_advance(iter, iter->processed); - iomap_iter_reset_iomap(iter); - if (ret <= 0) - return ret; - } + ret = iomap_iter_advance(iter, iter->processed); + iomap_iter_reset_iomap(iter); + if (ret <= 0) + return ret; +begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) -- GitLab From 9183b2a0e439ffa7ba2e176416efc4ffa21406d8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:47 -0500 Subject: [PATCH 149/934] iomap: lift error code check out of iomap_iter_advance() The error code is only used to check whether iomap_iter() should terminate due to an error returned in iter.processed. Lift the check out of iomap_iter_advance() in preparation to make it more generic. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-5-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a2ae99fe64316..1db16be7b9f00 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -30,8 +30,6 @@ static inline int iomap_iter_advance(struct iomap_iter *iter, s64 count) bool stale = iter->iomap.flags & IOMAP_F_STALE; int ret = 1; - if (count < 0) - return count; if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; iter->pos += count; @@ -71,6 +69,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + s64 processed; int ret; trace_iomap_iter(iter, ops, _RET_IP_); @@ -86,8 +85,14 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) return ret; } + processed = iter->processed; + if (processed < 0) { + iomap_iter_reset_iomap(iter); + return processed; + } + /* advance and clear state from the previous iteration */ - ret = iomap_iter_advance(iter, iter->processed); + ret = iomap_iter_advance(iter, processed); iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; -- GitLab From b26f2ea1cd068b0b902e3bb735d05398d8c05aba Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:48 -0500 Subject: [PATCH 150/934] iomap: lift iter termination logic from iomap_iter_advance() The iter termination logic in iomap_iter_advance() is only needed by iomap_iter() to determine whether to proceed with the next mapping for an ongoing operation. The old logic sets ret to 1 and then terminates if the operation is complete (iter->len == 0) or the previous iteration performed no work and the mapping has not been marked stale. The stale check exists to allow operations to retry the current mapping if an inconsistency has been detected. To further genericize iomap_iter_advance(), lift the termination logic into iomap_iter() and update the former to return success (0) or an error code. iomap_iter() continues on successful advance and non-zero iter->len or otherwise terminates in the no progress (and not stale) or error cases. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-6-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 1db16be7b9f00..8e0746ad80bd1 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -27,17 +27,11 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) */ static inline int iomap_iter_advance(struct iomap_iter *iter, s64 count) { - bool stale = iter->iomap.flags & IOMAP_F_STALE; - int ret = 1; - if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; iter->pos += count; iter->len -= count; - if (!iter->len || (!count && !stale)) - ret = 0; - - return ret; + return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) @@ -69,6 +63,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; s64 processed; int ret; @@ -91,8 +86,18 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) return processed; } - /* advance and clear state from the previous iteration */ + /* + * Advance the iter and clear state from the previous iteration. Use + * iter->len to determine whether to continue onto the next mapping. + * Explicitly terminate in the case where the current iter has not + * advanced at all (i.e. no work was done for some reason) unless the + * mapping has been marked stale and needs to be reprocessed. + */ ret = iomap_iter_advance(iter, processed); + if (!ret && iter->len > 0) + ret = 1; + if (ret > 0 && !iter->processed && !stale) + ret = 0; iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; -- GitLab From b51d30ff51f9c325b65c8cd66ff6590530b14041 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:49 -0500 Subject: [PATCH 151/934] iomap: export iomap_iter_advance() and return remaining length As a final step for generic iter advance, export the helper and update it to return the remaining length of the current iteration after the advance. This will usually be 0 in the iomap_iter() case, but will be useful for the various operations that iterate on their own and will be updated to advance as they progress. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-7-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 22 ++++++++-------------- include/linux/iomap.h | 1 + 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 8e0746ad80bd1..544cd7a5a16bb 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -15,22 +15,16 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) } /* - * Advance to the next range we need to map. - * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. count = 0). Hence the "finished - * iterating" case needs to distinguish between (count = 0) meaning we are done - * and (count = 0 && stale) meaning we need to remap the entire remaining range. + * Advance the current iterator position and output the length remaining for the + * current mapping. */ -static inline int iomap_iter_advance(struct iomap_iter *iter, s64 count) +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) { - if (WARN_ON_ONCE(count > iomap_length(iter))) + if (WARN_ON_ONCE(*count > iomap_length(iter))) return -EIO; - iter->pos += count; - iter->len -= count; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); return 0; } @@ -93,7 +87,7 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) * advanced at all (i.e. no work was done for some reason) unless the * mapping has been marked stale and needs to be reprocessed. */ - ret = iomap_iter_advance(iter, processed); + ret = iomap_iter_advance(iter, &processed); if (!ret && iter->len > 0) ret = 1; if (ret > 0 && !iter->processed && !stale) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index feb748eb62945..eed06ffdcfbd5 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -236,6 +236,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** * iomap_length_trim - trimmed length of the current iomap iteration -- GitLab From bc264fea0f6f230e56f876cc4266b1982d20f35d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:50 -0500 Subject: [PATCH 152/934] iomap: support incremental iomap_iter advances The current iomap_iter iteration model reads the mapping from the filesystem, processes the subrange of the operation associated with the current mapping, and returns the number of bytes processed back to the iteration code. The latter advances the position and remaining length of the iter in preparation for the next iteration. At the _iter() handler level, this tends to produce a processing loop where the local code pulls the current position and remaining length out of the iter, iterates it locally based on file offset, and then breaks out when the associated range has been fully processed. This works well enough for current handlers, but upcoming enhancements require a bit more flexibility in certain situations. Enhancements for zero range will lead to a situation where the processing loop is no longer a pure ascending offset walk, but rather dictated by pagecache state and folio lookup. Since folio lookup and write preparation occur at different levels, it is more difficult to manage position and length outside of the iter. To provide more flexibility to certain iomap operations, introduce support for incremental iomap_iter advances from within the operation itself. This allows more granular advances for operations that might not use the typical file offset based walk. Note that the semantics for operations that use incremental advances is slightly different than traditional operations. Operations that advance the iter directly are expected to return success or failure (i.e. 0 or negative error code) in iter.processed rather than the number of bytes processed. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-8-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/iter.c | 32 +++++++++++++++++++++++++------- include/linux/iomap.h | 8 ++++++-- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 544cd7a5a16bb..0ebcabc7df523 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -35,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -58,6 +60,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced = iter->processed > 0 ? iter->processed : 0; + u64 olen = iter->len; s64 processed; int ret; @@ -66,11 +70,22 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) if (!iter->iomap.length) goto begin; + /* + * If iter.processed is zero, the op may still have advanced the iter + * itself. Calculate the advanced and original length bytes based on how + * far pos has advanced for ->iomap_end(). + */ + if (!advanced) { + advanced = iter->pos - iter->iter_start_pos; + olen += advanced; + } + if (ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } @@ -81,8 +96,11 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) } /* - * Advance the iter and clear state from the previous iteration. Use - * iter->len to determine whether to continue onto the next mapping. + * Advance the iter and clear state from the previous iteration. This + * passes iter->processed because that reflects the bytes processed but + * not yet advanced by the iter handler. + * + * Use iter->len to determine whether to continue onto the next mapping. * Explicitly terminate in the case where the current iter has not * advanced at all (i.e. no work was done for some reason) unless the * mapping has been marked stale and needs to be reprocessed. @@ -90,7 +108,7 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) ret = iomap_iter_advance(iter, &processed); if (!ret && iter->len > 0) ret = 1; - if (ret > 0 && !iter->processed && !stale) + if (ret > 0 && !advanced && !stale) ret = 0; iomap_iter_reset_iomap(iter); if (ret <= 0) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eed06ffdcfbd5..e180dacf434c8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -218,8 +218,11 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @processed: The number of bytes the most recent iteration needs iomap_iter() + * to advance the iter, zero if the iter was already advanced, or a + * negative errno for an error during the operation. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -228,6 +231,7 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; + loff_t iter_start_pos; s64 processed; unsigned flags; struct iomap iomap; -- GitLab From 1a1a3b574b979912882f19d655ddac73f5cbc159 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:51 -0500 Subject: [PATCH 153/934] iomap: advance the iter directly on buffered writes Modify the buffered write path to advance the iter directly. Replace the local pos and length calculations with direct advances and loop based on iter state instead. Also remove the -EAGAIN return hack as it is no longer necessary now that separate return channels exist for processing progress and error returns. For example, the existing write handler must return either a count of bytes written or error if the write is interrupted, but presumably wants to return -EAGAIN directly in order to break the higher level iomap_iter() loop. Since the current iteration may have made some progress, it unwinds the iter on the way out to return the error while ensuring that portion of the write can be retried. If -EAGAIN occurs at any point beyond the first iteration, iomap_file_buffered_write() will then observe progress based on iter->pos to return a short write. With incremental advances on the iomap_iter, iomap_write_iter() can simply return the error. iomap_iter() completes whatever progress was made based on iomap_iter position and still breaks out of the iter loop based on the error code in iter.processed. The end result of the write is similar in terms of being a short write if progress was made or error return otherwise. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-9-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3458f97d1b1e1..93a7e8bdd15be 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -905,8 +905,6 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - loff_t length = iomap_length(iter); - loff_t pos = iter->pos; ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; @@ -919,7 +917,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - size_t written; /* Bytes have been written */ + u64 written; /* Bytes have been written */ + loff_t pos = iter->pos; bytes = iov_iter_count(i); retry: @@ -930,8 +929,8 @@ retry: if (unlikely(status)) break; - if (bytes > length) - bytes = length; + if (bytes > iomap_length(iter)) + bytes = iomap_length(iter); /* * Bring in the user page that we'll copy from _first_. @@ -1002,17 +1001,12 @@ retry: goto retry; } } else { - pos += written; total_written += written; - length -= written; + iomap_iter_advance(iter, &written); } - } while (iov_iter_count(i) && length); + } while (iov_iter_count(i) && iomap_length(iter)); - if (status == -EAGAIN) { - iov_iter_revert(i, total_written); - return -EAGAIN; - } - return total_written ? total_written : status; + return total_written ? 0 : status; } ssize_t -- GitLab From e60837da4d9daa8abadd9fafd9b1941fa1dba037 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:52 -0500 Subject: [PATCH 154/934] iomap: advance the iter directly on unshare range Modify unshare range to advance the iter directly. Replace the local pos and length calculations with direct advances and loop based on iter state instead. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-10-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 93a7e8bdd15be..61e541f0364a9 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1263,20 +1263,19 @@ EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_iter_advance(iter, &bytes); do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; @@ -1294,14 +1293,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) cond_resched(); - pos += bytes; - written += bytes; - length -= bytes; - balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length > 0); - return written; + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); + + return status; } int -- GitLab From cbad829cef3ba7318a2380a0eadc5059770f004a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:53 -0500 Subject: [PATCH 155/934] iomap: advance the iter directly on zero range Modify zero range to advance the iter directly. Replace the local pos and length calculations with direct advances and loop based on iter state instead. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-11-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 61e541f0364a9..8368a4ae716fa 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1341,17 +1341,16 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1372,14 +1371,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) if (WARN_ON_ONCE(!ret)) return -EIO; - pos += bytes; - length -= bytes; - written += bytes; - } while (length > 0); + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); if (did_zero) *did_zero = true; - return written; + return status; } int @@ -1433,11 +1432,14 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - loff_t proc = iomap_length(&iter); + s64 proc; if (range_dirty) { range_dirty = false; proc = iomap_zero_iter_flush_and_stale(&iter); + } else { + u64 length = iomap_length(&iter); + proc = iomap_iter_advance(&iter, &length); } iter.processed = proc; continue; -- GitLab From 5674609535bafa834ab014d90d9bbe8e89223a0b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Feb 2025 11:16:59 -0800 Subject: [PATCH 156/934] pstore: Change kmsg_bytes storage size to u32 The types around kmsg_bytes were inconsistent. The global was unsigned long, the argument to pstore_set_kmsg_bytes() was int, and the filesystem option was u32. Given other internal limits, there's not much sense in making a single pstore record larger than INT_MAX and it can't be negative, so use u32 everywhere. Additionally, use READ/WRITE_ONCE and a local variable in pstore_dump() to avoid kmsg_bytes changing during a dump. Link: https://lore.kernel.org/r/20250206191655.work.798-kees@kernel.org Signed-off-by: Kees Cook --- fs/pstore/inode.c | 2 +- fs/pstore/internal.h | 4 ++-- fs/pstore/platform.c | 11 ++++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56815799ce798..9de6b280c4f41 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -265,7 +265,7 @@ static void parse_options(char *options) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c3..a0fc511969100 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include #include -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f56b066ab80ce..557cf9d40177f 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; -- GitLab From 1a4e0d8682eb66a4fb37158a520eacd37c0b61b7 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 10 Feb 2025 16:52:25 +0800 Subject: [PATCH 157/934] sched_ext: Take NUMA node into account when allocating per-CPU cpumasks per-CPU cpumasks are dominantly accessed from their own local CPUs, so allocate them node-local to improve performance. Signed-off-by: Li RongQing Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b2378e29f45aa..90a66a2a43d5c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5998,15 +5998,16 @@ void __init init_sched_ext_class(void) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); -- GitLab From 2e7df12bdde13c0efefc3230dc96b36edfa53ec5 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Mon, 10 Feb 2025 23:54:30 +0900 Subject: [PATCH 158/934] tools/sched_ext: Update enum_defs.autogen.h Add where the script is located to the comment lines of the header file. This helps anyone re-generate the header file if required. Note that this is a sync from the PR [1] in the scx repo. [1] https://github.com/sched-ext/scx/pull/1322 Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/enum_defs.autogen.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h index 5551816856ee7..6e6c45f14fe1f 100644 --- a/tools/sched_ext/include/scx/enum_defs.autogen.h +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -1,11 +1,12 @@ /* - * WARNING: This file is autogenerated from scripts/gen_enum_defs.py. + * WARNING: This file is autogenerated from gen_enum_defs.py [1]. + * + * [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py */ #ifndef __ENUM_DEFS_AUTOGEN_H__ #define __ENUM_DEFS_AUTOGEN_H__ - #define HAVE_SCX_DSP_DFL_MAX_BATCH #define HAVE_SCX_DSP_MAX_LOOPS #define HAVE_SCX_WATCHDOG_MAX_TIMEOUT -- GitLab From 18f7686a1ce6b72e0ac61aa16f167eb931185cf1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Feb 2025 09:22:50 -0800 Subject: [PATCH 159/934] selftests/seccomp: Add hard-coded __NR_uretprobe for x86_64 Since headers don't always follow the selftests around correct, explicitly include the __NR_uretprobe syscall for better test coverage. Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 14ba51b52095a..b2f76a52215ad 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -155,6 +155,12 @@ struct seccomp_data { # endif #endif +#ifndef __NR_uretprobe +# if defined(__x86_64__) +# define __NR_uretprobe 335 +# endif +#endif + #ifndef SECCOMP_SET_MODE_STRICT #define SECCOMP_SET_MODE_STRICT 0 #endif -- GitLab From 0fe1ebf3f056d99ef4835dc5d88b9c3bb12e44c1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Jan 2025 16:03:00 +0100 Subject: [PATCH 160/934] seccomp/mips: change syscall_trace_enter() to use secure_computing() arch/mips/Kconfig selects HAVE_ARCH_SECCOMP_FILTER so syscall_trace_enter() can just use __secure_computing(NULL) and rely on populate_seccomp_data(sd) and "sd == NULL" checks in __secure_computing(sd) paths. With the change above syscall_trace_enter() can just use secure_computing() and avoid #ifdef + test_thread_flag(TIF_SECCOMP). CONFIG_GENERIC_ENTRY is not defined, so test_syscall_work(SECCOMP) will check TIF_SECCOMP. Signed-off-by: Oleg Nesterov Acked-by: Thomas Bogendoerfer Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250128150300.GA15318@redhat.com Signed-off-by: Kees Cook --- arch/mips/kernel/ptrace.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 61503a36067e9..f7107479c7fa9 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1326,24 +1326,8 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs) return -1; } -#ifdef CONFIG_SECCOMP - if (unlikely(test_thread_flag(TIF_SECCOMP))) { - int ret, i; - struct seccomp_data sd; - unsigned long args[6]; - - sd.nr = current_thread_info()->syscall; - sd.arch = syscall_get_arch(current); - syscall_get_arguments(current, regs, args); - for (i = 0; i < 6; i++) - sd.args[i] = args[i]; - sd.instruction_pointer = KSTK_EIP(current); - - ret = __secure_computing(&sd); - if (ret == -1) - return ret; - } -#endif + if (secure_computing()) + return -1; if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[2]); -- GitLab From b37778bec82ba82058912ca069881397197cd3d5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Jan 2025 16:03:07 +0100 Subject: [PATCH 161/934] seccomp: fix the __secure_computing() stub for !HAVE_ARCH_SECCOMP_FILTER Depending on CONFIG_HAVE_ARCH_SECCOMP_FILTER, __secure_computing(NULL) will crash or not. This is not consistent/safe, especially considering that after the previous change __secure_computing(sd) is always called with sd == NULL. Fortunately, if CONFIG_HAVE_ARCH_SECCOMP_FILTER=n, __secure_computing() has no callers, these architectures use secure_computing_strict(). Yet it make sense make __secure_computing(NULL) safe in this case. Note also that with this change we can unexport secure_computing_strict() and change the current callers to use __secure_computing(NULL). Fixes: 8cf8dfceebda ("seccomp: Stub for !HAVE_ARCH_SECCOMP_FILTER") Signed-off-by: Oleg Nesterov Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250128150307.GA15325@redhat.com Signed-off-by: Kees Cook --- include/linux/seccomp.h | 8 ++------ kernel/seccomp.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index e45531455d3bb..d55949071c30e 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -22,8 +22,9 @@ #include #include -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER extern int __secure_computing(const struct seccomp_data *sd); + +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) @@ -32,11 +33,6 @@ static inline int secure_computing(void) } #else extern void secure_computing_strict(int this_syscall); -static inline int __secure_computing(const struct seccomp_data *sd) -{ - secure_computing_strict(sd->nr); - return 0; -} #endif extern long prctl_get_seccomp(void); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7bbb408431ebc..3231f63d93d89 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,13 +29,11 @@ #include #include +#include + /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -#include -#endif - #ifdef CONFIG_SECCOMP_FILTER #include #include @@ -1074,6 +1072,14 @@ void secure_computing_strict(int this_syscall) else BUG(); } +int __secure_computing(const struct seccomp_data *sd) +{ + int this_syscall = sd ? sd->nr : + syscall_get_nr(current, current_pt_regs()); + + secure_computing_strict(this_syscall); + return 0; +} #else #ifdef CONFIG_SECCOMP_FILTER -- GitLab From 1027cd8084bbcdf80d8a096d5e2c6da91402fc3c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Jan 2025 16:03:13 +0100 Subject: [PATCH 162/934] seccomp: remove the 'sd' argument from __secure_computing() After the previous changes 'sd' is always NULL. Signed-off-by: Oleg Nesterov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250128150313.GA15336@redhat.com Signed-off-by: Kees Cook --- arch/powerpc/kernel/ptrace/ptrace.c | 2 +- include/linux/seccomp.h | 6 +++--- kernel/entry/common.c | 2 +- kernel/seccomp.c | 12 +++++------- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 727ed4a145451..c6997df632873 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -215,7 +215,7 @@ static int do_seccomp(struct pt_regs *regs) * have already loaded -ENOSYS into r3, or seccomp has put * something else in r3 (via SECCOMP_RET_ERRNO/TRACE). */ - if (__secure_computing(NULL)) + if (__secure_computing()) return -1; /* diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index d55949071c30e..9b959972bf4a2 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -22,13 +22,13 @@ #include #include -extern int __secure_computing(const struct seccomp_data *sd); +extern int __secure_computing(void); #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) - return __secure_computing(NULL); + return __secure_computing(); return 0; } #else @@ -54,7 +54,7 @@ static inline int secure_computing(void) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } +static inline int __secure_computing(void) { return 0; } static inline long prctl_get_seccomp(void) { diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e33691d5adf7a..20154572ede94 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Do seccomp after ptrace, to catch any tracer changes. */ if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); + ret = __secure_computing(); if (ret == -1L) return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3231f63d93d89..e90cbdf351667 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1072,10 +1072,9 @@ void secure_computing_strict(int this_syscall) else BUG(); } -int __secure_computing(const struct seccomp_data *sd) +int __secure_computing(void) { - int this_syscall = sd ? sd->nr : - syscall_get_nr(current, current_pt_regs()); + int this_syscall = syscall_get_nr(current, current_pt_regs()); secure_computing_strict(this_syscall); return 0; @@ -1365,7 +1364,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } #endif -int __secure_computing(const struct seccomp_data *sd) +int __secure_computing(void) { int mode = current->seccomp.mode; int this_syscall; @@ -1374,15 +1373,14 @@ int __secure_computing(const struct seccomp_data *sd) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; - this_syscall = sd ? sd->nr : - syscall_get_nr(current, current_pt_regs()); + this_syscall = syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, sd, false); + return __seccomp_filter(this_syscall, NULL, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); -- GitLab From e1cec5107c394911c32ddd907e89d77249c48559 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Jan 2025 16:03:21 +0100 Subject: [PATCH 163/934] seccomp: remove the 'sd' argument from __seccomp_filter() After the previous change 'sd' is always NULL. Signed-off-by: Oleg Nesterov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250128150321.GA15343@redhat.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e90cbdf351667..0ce17c6161506 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1230,13 +1230,12 @@ out: return -1; } -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_data sd; struct seccomp_filter *match = NULL; int data; - struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -1244,12 +1243,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ smp_rmb(); - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } + populate_seccomp_data(&sd); - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = seccomp_run_filters(&sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1307,13 +1303,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, * a reload of all registers. This does not goto skip since * a skip would have already been reported. */ - if (__seccomp_filter(this_syscall, NULL, true)) + if (__seccomp_filter(this_syscall, true)) return -1; return 0; case SECCOMP_RET_USER_NOTIF: - if (seccomp_do_user_notification(this_syscall, match, sd)) + if (seccomp_do_user_notification(this_syscall, match, &sd)) goto skip; return 0; @@ -1355,8 +1351,7 @@ skip: return -1; } #else -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { BUG(); @@ -1380,7 +1375,7 @@ int __secure_computing(void) __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, NULL, false); + return __seccomp_filter(this_syscall, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); -- GitLab From 7038f9f2e86d506c6a43079864e0112045202a8f Mon Sep 17 00:00:00 2001 From: Ritvik Gupta Date: Mon, 10 Feb 2025 04:39:37 +0000 Subject: [PATCH 164/934] documentation/filesystems: fix spelling mistakes Corrected the following spelling mistakes, based on the suggestions by codespell: 1. Optionaly -> Optionally 2. prefereable -> preferable 3. peformance -> performance 4. ontext -> context 5. failuer -> failure 6. poiners -> pointers 7. realtively -> relatively 8. uptream -> upstream Signed-off-by: Ritvik Gupta Acked-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250210043937.30952-1-ritvikfoss@gmail.com --- Documentation/filesystems/9p.rst | 2 +- Documentation/filesystems/bcachefs/SubmittingPatches.rst | 4 ++-- Documentation/filesystems/coda.rst | 2 +- Documentation/filesystems/debugfs.rst | 2 +- Documentation/filesystems/netfs_library.rst | 2 +- Documentation/filesystems/xfs/xfs-delayed-logging-design.rst | 2 +- .../filesystems/xfs/xfs-maintainer-entry-profile.rst | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 2bbf68b56b0d7..3078f3c9256a8 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -90,7 +90,7 @@ Just start the 9pfs capable network server like diod/nfs-ganesha e.g.:: $ diod -f -n -d 0 -S -l 0.0.0.0:9999 -e $PWD -Optionaly scan your bus if there are more then one usbg gadgets to find their path:: +Optionally scan your bus if there are more then one usbg gadgets to find their path:: $ python $kernel_dir/tools/usb/p9_fwd.py list diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst index 026b12ae0d6a2..4b79ca58faf27 100644 --- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst +++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst @@ -30,7 +30,7 @@ CI: === Instead of running your tests locally, when running the full test suite it's -prefereable to let a server farm do it in parallel, and then have the results +preferable to let a server farm do it in parallel, and then have the results in a nice test dashboard (which can tell you which failures are new, and presents results in a git log view, avoiding the need for most bisecting). @@ -68,7 +68,7 @@ Other things to think about: land - use them. Use them judiciously, and not as a replacement for proper error handling, but use them. -- Does it need to be performance tested? Should we add new peformance counters? +- Does it need to be performance tested? Should we add new performance counters? bcachefs has a set of persistent runtime counters which can be viewed with the 'bcachefs fs top' command; this should give users a basic idea of what diff --git a/Documentation/filesystems/coda.rst b/Documentation/filesystems/coda.rst index bdde7e4e010b9..0db3c83a50e55 100644 --- a/Documentation/filesystems/coda.rst +++ b/Documentation/filesystems/coda.rst @@ -141,7 +141,7 @@ kernel support. a process P which accessing a Coda file. It makes a system call which traps to the OS kernel. Examples of such calls trapping to the kernel are ``read``, ``write``, ``open``, ``close``, ``create``, ``mkdir``, - ``rmdir``, ``chmod`` in a Unix ontext. Similar calls exist in the Win32 + ``rmdir``, ``chmod`` in a Unix context. Similar calls exist in the Win32 environment, and are named ``CreateFile``. Generally the operating system handles the request in a virtual diff --git a/Documentation/filesystems/debugfs.rst b/Documentation/filesystems/debugfs.rst index f7f977ffbf8d4..610f718ef8b54 100644 --- a/Documentation/filesystems/debugfs.rst +++ b/Documentation/filesystems/debugfs.rst @@ -220,7 +220,7 @@ There are a couple of other directory-oriented helper functions:: A call to debugfs_change_name() will give a new name to an existing debugfs file, always in the same directory. The new_name must not exist prior -to the call; the return value is 0 on success and -E... on failuer. +to the call; the return value is 0 on success and -E... on failure. Symbolic links can be created with debugfs_create_symlink(). There is one important thing that all debugfs users must take into account: diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 73f0bfd7e903a..3886c14f89f45 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -515,7 +515,7 @@ The methods defined in the table are: the cache to expand a request in either direction. This allows the cache to size the request appropriately for the cache granularity. - The function is passed poiners to the start and length in its parameters, + The function is passed pointers to the start and length in its parameters, plus the size of the file for reference, and adjusts the start and length appropriately. It should return one of: diff --git a/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst index 6402ab8e370c8..2a2705e975e84 100644 --- a/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst +++ b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst @@ -219,7 +219,7 @@ The log is circular, so the positions in the log are defined by the combination of a cycle number - the number of times the log has been overwritten - and the offset into the log. A LSN carries the cycle in the upper 32 bits and the offset in the lower 32 bits. The offset is in units of "basic blocks" (512 -bytes). Hence we can do realtively simple LSN based math to keep track of +bytes). Hence we can do relatively simple LSN based math to keep track of available space in the log. Log space accounting is done via a pair of constructs called "grant heads". The diff --git a/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst index 32b6ac4ca9d61..ce4584fb3103c 100644 --- a/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst +++ b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst @@ -93,7 +93,7 @@ others on a regular basis about burnout. sponsoring work on any part of XFS. - **LTS Maintainer**: Someone who backports and tests bug fixes from - uptream to the LTS kernels. + upstream to the LTS kernels. There tend to be six separate LTS trees at any given time. The maintainer for a given LTS release should identify themselves with an -- GitLab From 07ab93f3cc88d76250b7c3659ce9c5f7a013d7ce Mon Sep 17 00:00:00 2001 From: Charles Han Date: Fri, 7 Feb 2025 15:34:29 +0800 Subject: [PATCH 165/934] Documentation: Remove repeated word in docs Remove the repeated word "to" docs. Signed-off-by: Charles Han Acked-by: Marc Kleine-Budde Acked-by: Vincent Mailhol Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250207073433.23604-1-hanchunchao@inspur.com --- .../devicetree/bindings/net/can/microchip,mcp251xfd.yaml | 2 +- Documentation/filesystems/xfs/xfs-online-fsck-design.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml b/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml index 2a98b26630cb1..c155c9c6db392 100644 --- a/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml +++ b/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml @@ -40,7 +40,7 @@ properties: microchip,rx-int-gpios: description: - GPIO phandle of GPIO connected to to INT1 pin of the MCP251XFD, which + GPIO phandle of GPIO connected to INT1 pin of the MCP251XFD, which signals a pending RX interrupt. maxItems: 1 diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index 12aa638408304..e231d127cd405 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -4521,8 +4521,8 @@ Both online and offline repair can use this strategy. | For this second effort, the ondisk parent pointer format as originally | | proposed was ``(parent_inum, parent_gen, dirent_pos) → (dirent_name)``. | | The format was changed during development to eliminate the requirement | -| of repair tools needing to to ensure that the ``dirent_pos`` field | -| always matched when reconstructing a directory. | +| of repair tools needing to ensure that the ``dirent_pos`` field always | +| matched when reconstructing a directory. | | | | There were a few other ways to have solved that problem: | | | -- GitLab From 2783096fb1ddd3d5987a68efe8d52b7afccdda04 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Thu, 30 Jan 2025 16:28:09 +0900 Subject: [PATCH 166/934] docs: submit-checklist: Expand on build tests against different word sizes Existing sentence on cross-compilation that mentions ppc64 does not make much sense in today's perspective. Expand it for the benefits of testing against architectures of different word sizes and endianness. Signed-off-by: Akira Yokosawa Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/05c0b99c-c2e9-4702-90fd-8a4127586424@gmail.com --- Documentation/process/submit-checklist.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index e531dd504b6c2..88b6358258d76 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -91,9 +91,12 @@ Build your code fix any issues. 2) Builds on multiple CPU architectures by using local cross-compile tools - or some other build farm. Note that ppc64 is a good architecture for - cross-compilation checking because it tends to use ``unsigned long`` for - 64-bit quantities. + or some other build farm. + Note that testing against architectures of different word sizes + (32- and 64-bit) and different endianness (big- and little-) is effective + in catching various portability issues due to false assumptions on + representable quantity range, data alignment, or endianness, among + others. 3) Newly-added code has been compiled with ``gcc -W`` (use ``make KCFLAGS=-W``). This will generate lots of noise, but is good -- GitLab From 8bc237a1320c8d35a936562cdd3551dd0c41f3d2 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 7 Feb 2025 16:48:20 +0800 Subject: [PATCH 167/934] docs/zh_CN: Update the translation of dev-tools/ubsan to v6.14-rc1 Commit 918327e9b7ff ("ubsan: Remove CONFIG_UBSAN_SANITIZE_ALL") removed the CONFIG_UBSAN_SANITIZE_ALL configuration option. Update the Chinese documentation accordingly and revise the document format by the way. Link: https://lore.kernel.org/all/6F05157E5E157493+20250123043258.149643-1-wangyuli@uniontech.com/ Link: https://lore.kernel.org/all/fb3c5ec4-eabc-48c0-bf0b-d20cad978b4f@linux.dev/ Signed-off-by: WangYuli Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/2B33A00C9F5BECC7+20250207084821.251531-1-wangyuli@uniontech.com --- .../translations/zh_CN/dev-tools/ubsan.rst | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/Documentation/translations/zh_CN/dev-tools/ubsan.rst b/Documentation/translations/zh_CN/dev-tools/ubsan.rst index 2487696b37728..81ef6f77caeb6 100644 --- a/Documentation/translations/zh_CN/dev-tools/ubsan.rst +++ b/Documentation/translations/zh_CN/dev-tools/ubsan.rst @@ -3,7 +3,14 @@ .. include:: ../disclaimer-zh_CN.rst :Original: Documentation/dev-tools/ubsan.rst -:Translator: Dongliang Mu + +:翻译: + + 慕冬亮 Dongliang Mu + +:校译: + + 王昱力 WangYuli 未定义行为消毒剂 - UBSAN ==================================== @@ -55,30 +62,20 @@ GCC自4.9.x [1_] (详见 ``-fsanitize=undefined`` 选项及其子选项)版 使用如下内核配置启用UBSAN:: - CONFIG_UBSAN=y - -使用如下内核配置检查整个内核:: - - CONFIG_UBSAN_SANITIZE_ALL=y - -为了在特定文件或目录启动代码插桩,需要在相应的内核Makefile中添加一行类似内容: + CONFIG_UBSAN=y -- 单文件(如main.o):: - - UBSAN_SANITIZE_main.o := y - -- 一个目录中的所有文件:: - - UBSAN_SANITIZE := y - -即使设置了``CONFIG_UBSAN_SANITIZE_ALL=y``,为了避免文件被插桩,可使用:: +排除要被检测的文件:: UBSAN_SANITIZE_main.o := n -与:: +排除一个目录中的所有文件:: UBSAN_SANITIZE := n +当全部文件都被禁用,可通过如下方式为特定文件启用:: + + UBSAN_SANITIZE_main.o := y + 未对齐的内存访问检测可通过开启独立选项 - CONFIG_UBSAN_ALIGNMENT 检测。 该选项在支持未对齐访问的架构上(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y) 默认为关闭。该选项仍可通过内核配置启用,但它将产生大量的UBSAN报告。 -- GitLab From 24b330444886957ee8b4960b5f4c8e49909f15b1 Mon Sep 17 00:00:00 2001 From: zhangwei Date: Fri, 7 Feb 2025 18:18:55 +0800 Subject: [PATCH 168/934] docs/zh_CN: Add self-protection index Chinese translation Translate .../security/self-protection.rst into Chinese. Update the translation through commit b080e52110ea ("docs: update self-protection __ro_after_init status") Reviewed-by: Yanteng Si Reviewed-by: Yuxian Mao Signed-off-by: zhangwei Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/dd8c6da2e145aadac202e979bea0ff6b56431e0b.1738923258.git.zhangwei@cqsoftware.com.cn --- .../translations/zh_CN/security/index.rst | 2 +- .../zh_CN/security/self-protection.rst | 271 ++++++++++++++++++ 2 files changed, 272 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/security/self-protection.rst diff --git a/Documentation/translations/zh_CN/security/index.rst b/Documentation/translations/zh_CN/security/index.rst index 0aacecabf0c0e..9a6345dd90b8e 100644 --- a/Documentation/translations/zh_CN/security/index.rst +++ b/Documentation/translations/zh_CN/security/index.rst @@ -18,6 +18,7 @@ credentials lsm sak + self-protection siphash tpm/index digsig @@ -29,6 +30,5 @@ TODOLIST: * keys/index * lsm-development * SCTP -* self-protection * secrets/index * ipe diff --git a/Documentation/translations/zh_CN/security/self-protection.rst b/Documentation/translations/zh_CN/security/self-protection.rst new file mode 100644 index 0000000000000..3c8a68b1e1be9 --- /dev/null +++ b/Documentation/translations/zh_CN/security/self-protection.rst @@ -0,0 +1,271 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../disclaimer-zh_CN.rst +:Original: Documentation/security/self-protection.rst + +:翻译: + + 张巍 zhangwei + +============ +内核自我保护 +============ + +内核自我保护是指在Linux内核中设计与实现的各种系统与结构 +以防止内核本身的安全漏洞问题。它涵盖了广泛问题,包括去除 +整个类的漏洞,阻止安全漏洞利用方法,以及主动检测攻击尝 +试。并非所有的话题都在本文中涉及,但它应该为了解内核自我 +保护提供一个合理的起点,并解答常见的问题。(当然,欢迎提 +交补丁!) + +在最坏的情况下,我们假设一个非特权的本地攻击者对内核内存 +有任意读写访问权限。虽然在许多情况下,漏洞被利用时并不会 +提供此级别的访问权限,但如果我们能防御最坏情况,也能应对 +权限较低的攻击。一个更高的标准,且需要牢记的是保护内核免 +受具有特权的本地攻击者的攻击,因为root用户可以有更多权限。 +(尤其是当他们能够加载任意内核模块时) + +成功的自我保护的目标是:有效、默认开启、不需要开发者主动 +选择、没有性能影响、不妨碍内核调试、并且没有测试。虽然很 +难满足所有的这些目标,但明确提到这些目标非常重要,因为这 +些方面需要被探索、解决或接受。 + +========== +攻击面缩减 +========== + +防止安全漏洞最基本的防御方式是减少可以被用来重定向执行的 +内核区域。这包括限制用户公开使用的API、使内核API更难被错 +误使用、最小化可写内核内存区域等。 + +严格的内核内存权限 +------------------- + +当所有内核内存都是可写的,攻击者可以轻松地重定向执行流。 +为了减少这种攻击目标的可用性,内核需要更严格的权限集来 +保护其内存。 + +可执行代码和只读数据必须不可写 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +任何具有可执行内存的区域必须不可写,显然这也包括内核文本 +本身。我们还必须考虑其他地方:内核模块、JIT内存等,(在 +某些情况下,为了支持像指令替代、断点、kprobes等功能,这些 +区域会暂时被设置为可写。如果这些功能必须存在于内核中,它 +们的实现方式是:在更新期间将内存临时设置可写,然后再恢复 +为原始权限。) + +为了支持这一点,CONFIG_STRICT_KERNEL_RWX 和 +CONFIG_STRICT_MODULE_RWX 的设计旨在确保代码不可写,数据不 +可执行,以及只读数据既不可写也不可执行。 + +大多数架构默认支持这些选项,且用户无法选择。对于一些像arm +这种希望能够选择这些选项的架构,可以在架构Kconfig中选择 +ARCH_OPTIONAL_KERNEL_RWX以启用Kconfig提示。 +CONFIG_ARCH_OPTIONAL_KERNEL_RWX_DEFAULT决定在启用 +ARCH_OPTIONAL_KERNEL_RWX时的默认设置。 + +函数指针和敏感变量必须不可写 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +内核内存中有大量的函数指针,这些指针被内核查找并用于继续执行 +(例如,描述符/向量表、文件/网络等操作结构等)。这些变量的数 +量必须减少到最低限度 + +许多像这样的变量可以通过设置为"const"来实现只读,从而使它们 +存放在内核的.rodata段而非.data段,从而获得内核严格内存权限的 +保护。 + +对于在_init是仅初始化一次的变量,可以使用_ro_after_init属性 +进行标记。 + +剩下的变量通常是那些更新频率较低的(例如GDT)。这些变量需要另 +一个机制(类似于上述提到的对内核代码所做的临时例外),以便在 +其余生命周期内保持只读状态。(例如,在进行更新时,只有执行 +更新的CPU线程会被授予对内存的不可中断写入访问权限。) + +将内核内存与用户空间内存分隔开 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +内核绝对不可以执行用户空间内存,同时,内核也不得在没有明确预 +期的情况下访问用户内存空间。这些规则可以通过一些硬件限制来支 +持(如x86的SMEP/SMAP,ARM的PXN/PAN)或通过仿真(如ARM的内存 +域)来强制执行。通过这种方式阻止用户空间内存的访问,攻击者就 +无法将执行和数据解析转移到易于控制的用户空间内存,从而迫使攻 +击完全在内核中进行。 + +减少对系统调用的访问 +-------------------- + +对于64位系统,一种消除许多系统调用最简单的方法是构建时不启用 +CONFIG_CONPAT。然而,这种情况通常不可行。 + +“seccomp”系统为用户空间提供了一种可选功能,提供了一种减少可供 +运行中进程使用内核入口点数量的方法。这限制了可以访问内核代码 +的范围,可能降低了某个特定漏洞被攻击者利用的可能性。 + +一个改进的方向是创建有效的方法,仅允许受信任的进程访问例如兼 +容模式、用户命名空间、BPF创建和性能分析等功能。这将把内核入口 +点范围限制在通常可以被非特权用户空间进程访问的较常见集合中 + +限制对内核模块的访问 +-------------------- + +内核绝不应允许非特权用户加载特定的内核模块,因为这可能为攻击者 +提供一个意外扩展的可用攻击面的方法。(通过已预定义子系统按需加 +载模块,如MODULE_ALIAS_*,被认为是“预期的”,但即便如此,也应对 +这些情况给予更多的关注。)例如,通过非特权的套接字API加载文件 +系统模块是没有意义的:只有root用户或物理本地用户应该触发文件系 +统模块的加载。(在某些情况下,这甚至可能存在争议。) + +为了防止特权用户的攻击,系统可能需要完全禁止模块加载(例如,通 +过单体内核构建或modules_disabled sysctl),或者使用签名模块(例 +如,CONFIG_MODULE_SIG_FORCE或通过LoadPin保护的dm-crypt),以防 +止root用户通过模块加载器加载任意内核代码。 + +内存完整性 +---------- + +内核中有许多内存结构在攻击过程中被定期泛滥用以获取执行控制,迄今 +为止,最常见的是堆栈缓冲区溢出,在这种攻击中,堆栈上存储的返回地 +址被覆盖。除此之外,还有许多其他类型的攻击,防护措施也应运而生。 + +堆栈缓冲区溢出 +-------------- + +经典的堆栈缓冲区溢出攻击是指超出栈上分配的变量预期大小,从而将一 +个受控值写入栈帧的返回地址。最常见的防御措施是堆栈保护 +(CONFIG_STACKPROTECTOR),它在函数返回前会验证栈上的“stack canary”。 +其他防御措施还包括影子堆栈等。 + +堆栈深度溢出 +------------ + +一个不太容易被理解的攻击方式是利用bug触发内核通过深度函数调用或 +大的堆栈分配来消耗堆栈内存。通过这种攻击,攻击者可以将数据写入内 +核预分配堆栈空间之外的敏感结构。为了更好的防护这种攻击,必须进行 +两项重要的更改:将敏感的线程信息结构转移到其他地方,并在堆栈底部 +添加一个故障内存洞,以捕获这些溢出 + +栈内存完整性 +------------ + +用于跟踪堆空闲列表的结构可以在分配和释放时进行完整性检查,以确保它 +们不会被用来操作其它内存区域。 + +计算器完整性 +------------ + +内核中的许多地方使用原子计数器来跟踪对象引用或执行类似的生命周期管 +理。当这些计数器可能发生溢出时(无论是上溢还是下溢),这通常会暴露 +出使用后释放(use-after-free)漏洞。通过捕捉原子计数器溢出,这类漏 +洞就可以消失。 + +大小计算溢出检测 +---------------- + +与计算器溢出类似,整数溢出(通常是大小计算)需要在运行时进行检测, +以防止这类在传统上会导致能够写入内核缓冲区末尾之外的漏洞。 + +概率性防御 +---------- + +尽管许多防御措施可以被认定是确定的(例如,只读内存不能写入),但 +有些确保措施仅提供统计防御,即攻击者必须收集足够的关于运行系统的 +信息才能突破防御。尽管这些防御并不完美,但它们确实提供了有意义的 +保护。 + +栈保护、迷惑技术和其他秘密 +-------------------------- + +值得注意的是,像之前讨论的栈保护这样的技术,从技术上来说是统计性防 +御,因为它们依赖于一个秘密值,而这样的值可能会通过信息泄露漏洞而被 +发现。 + +对于想JIT(及时翻译器)这样的情况,其中可执行内容可能部分由用户空间 +控制,也需要类似的秘密之来迷惑。 + +至关重要的是,所使用的秘密值必须是独立的(例如,每个栈使用不同的栈 +保护值),并且具有高熵(例如,随机数生成器(RNG)是否正常工作?), +以最大限度地提高其成功率。 + +内核地址空间布局随机化(KASLR) +------------------------------- + +由于内核内存的位置几乎总是攻击成功的关键因素,因此使内核内存位置变 +得非确定性会增加攻击的难度。(请注意,这反过来提高了信息泄露的价 +值,因为泄露的信息可以用来发现目标内存位置。) + +文本和模块基址 +-------------- + +通过在启动时重新设定内核的物理基地址和虚拟基地址 +(CONFIG_RANDOMIZE_BASE),那些需要利用内核代码的攻击将会受阻。此外 +通过偏移模块加载基地址,意味着即使系统每次启动时按相同顺序加载同一 +组模块,这些模块也不会与内核文本的其余部分公用一个基地址。 + +堆栈基地址 +---------- + +如果进程之间内核堆栈的基地址不相同,甚至在不同系统调用之间也不相同, +那么栈上或超出栈的目标位置就会变得更加难以确定。 + +动态内存基址 +------------ + +很多内核的动态内存(例如kmalloc,vmalloc等)由于早期启动初始化的顺 +序,最终布局是相对确定的。如果这些区域的基地址在启动之间不相同,攻 +击者就无法轻易定位它们,必须依赖于针对该区域的信息泄露才能成功。 + +结构布局 +-------- + +通过在每次构建时对敏感结构的布局进行随机化处理,攻击这必须将攻击调 +节到已知的内核版本,或者泄露足够的内核内存来确定结构布局,然后才能 +对其进行操作。 + +防止信息泄露 +------------ + +由于敏感结构的位置是攻击的主要目标,因此防止内核内存地址和内核内存 +内容泄露非常重要(因为它们可能包含内核地址或者其他敏感数据,例如 +栈保护值)。 + +内核地址 +-------- + +将内核地址打印到用户空间会泄露有关内核内存布局的敏感信息。在使用任 +何打印符号打印原始地址时,目前%px,%p[ad](和在某些情况下的%p[sSb]) +时。使用这些格式符写入的文件需要限制为只有特权进程可读。 + +在4.14及以前的内核版本中,使用%p格式符打印的是原始地址。从4.15-rcl +版本开始,使用%p格式符打印的地址会在打印前进行哈希处理。 + +[*]如果启用KALLSYMS并且符号查找失败,则打印原始地址;如果没有启用 +KALLSYSM,则会直接打印原始地址。 + +唯一标识符 +---------- + +内核内存地址绝不可能用作向用户空间公开的标识符。相反,应该使用原子 +计数器,IDR(ID映射表)或类似的唯一标识符。 + +内存初始化 +---------- + +复制到用户空间的内存必须始终被完全初始化,如果没有显式地使用memset() +函数进行初始化,那就需要修改编译器,确保清除结构中的空洞。 + +内存清除 +-------- + +在释放内存时,最好对内存内容进行清除处理,以防止攻击者重用内存中以前 +的内容。例如,在系统调用返回时清除堆栈(CONFIG_GCC_PLUGIN_STACKLEAK), +在释放堆内容是清除其内容。这有助于防止许多未初始化变量攻击、堆栈内容 +泄露、堆内容泄露以及使用后释放攻击(user-after-free)。 + +目标追踪 +-------- + +为了帮助消除导致内核地址被写入用户空间的各种错误,需要跟踪写入的目标。 +如果缓冲区的目标是用户空间(例如,基于seq_file的/proc文件),则应该自 +动审查敏感值。 -- GitLab From f460cd3080134b46b11cb42a7cd268239adc2643 Mon Sep 17 00:00:00 2001 From: zhangwei Date: Fri, 7 Feb 2025 18:18:56 +0800 Subject: [PATCH 169/934] docs/zh_CN: Add keys index Chinese translation Translate .../security/keys/index.rst into Chinese Update the translation through commit 5395d312dff0 ("doc: ReSTify keys-trusted-encrypted.txt") Signed-off-by: zhangwei Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/6e55a93d889871a872a3449cb186c28fb38fd3df.1738923258.git.zhangwei@cqsoftware.com.cn --- .../zh_CN/security/keys/index.rst | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 Documentation/translations/zh_CN/security/keys/index.rst diff --git a/Documentation/translations/zh_CN/security/keys/index.rst b/Documentation/translations/zh_CN/security/keys/index.rst new file mode 100644 index 0000000000000..7c28d003fb0a7 --- /dev/null +++ b/Documentation/translations/zh_CN/security/keys/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/keys/index.rst + +:翻译: + + +======== +内核密钥 +======== + +.. toctree:: + :maxdepth: 1 + + +TODOLIST: +* core +* ecryptfs +* request-key +* trusted-encrypted -- GitLab From 2b087edf588c66d149a7ba29bd37f2ad6edbdc9f Mon Sep 17 00:00:00 2001 From: zhangwei Date: Fri, 7 Feb 2025 18:18:57 +0800 Subject: [PATCH 170/934] docs/zh_CN: Add secrets index Chinese translation Translate .../security/secrets/index.rst into Chinese Update the translation through commit 7419995a331c ("docs: security: Add secrets/coco documentation") Signed-off-by: zhangwei Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/57978b69f643c1aacf78804affde4c819960fd3c.1738923258.git.zhangwei@cqsoftware.com.cn --- .../zh_CN/security/secrets/index.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Documentation/translations/zh_CN/security/secrets/index.rst diff --git a/Documentation/translations/zh_CN/security/secrets/index.rst b/Documentation/translations/zh_CN/security/secrets/index.rst new file mode 100644 index 0000000000000..5ea78713f10e6 --- /dev/null +++ b/Documentation/translations/zh_CN/security/secrets/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/secrets/index.rst + +:翻译: + +===================== +密钥文档 +===================== + +.. toctree:: + + +TODOLIST: + +* coco -- GitLab From 3d89178b85a1c1fd106289afba0adfa6d013676a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:50 +0100 Subject: [PATCH 171/934] docs: sphinx: remove kernellog.py file In the past, there was a need for a wrapper due to different Sphinx versions support (before Sphinx 1.6). This is long gone, and now it is just a wrapper. Get rig of it to simplify the code. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/48bf16f199250d7048ca164d1b90773861915157.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/kernel_abi.py | 5 +- Documentation/sphinx/kerneldoc.py | 14 +++--- Documentation/sphinx/kernellog.py | 22 -------- Documentation/sphinx/kfigure.py | 81 ++++++++++++++++-------------- 4 files changed, 53 insertions(+), 69 deletions(-) delete mode 100644 Documentation/sphinx/kernellog.py diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 5911bd0d79657..8401562cb5d9d 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -37,13 +37,13 @@ import os import subprocess import sys import re -import kernellog from docutils import nodes, statemachine from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive from docutils.utils.error_reporting import ErrorString from sphinx.util.docutils import switch_source_input +from sphinx.util import logging __version__ = '1.0' @@ -64,6 +64,7 @@ class KernelCmd(Directive): optional_arguments = 2 has_content = False final_argument_whitespace = True + logger = logging.getLogger('kernel_abi') option_spec = { "debug" : directives.flag, @@ -129,7 +130,7 @@ class KernelCmd(Directive): else: content.append(line, f, ln) - kernellog.info(self.state.document.settings.env.app, "%s: parsed %i lines" % (fname, n)) + self.logger.info("%s: parsed %i lines" % (fname, n)) if content: self.do_parse(content, node) diff --git a/Documentation/sphinx/kerneldoc.py b/Documentation/sphinx/kerneldoc.py index ec1ddfff1863f..be5b8fbf373ff 100644 --- a/Documentation/sphinx/kerneldoc.py +++ b/Documentation/sphinx/kerneldoc.py @@ -39,7 +39,7 @@ from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive import sphinx from sphinx.util.docutils import switch_source_input -import kernellog +from sphinx.util import logging __version__ = '1.0' @@ -56,6 +56,7 @@ class KernelDocDirective(Directive): 'functions': directives.unchanged, } has_content = False + logger = logging.getLogger('kerneldoc') def run(self): env = self.state.document.settings.env @@ -109,8 +110,7 @@ class KernelDocDirective(Directive): cmd += [filename] try: - kernellog.verbose(env.app, - 'calling kernel-doc \'%s\'' % (" ".join(cmd))) + self.logger.verbose("calling kernel-doc '%s'" % (" ".join(cmd))) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() @@ -120,8 +120,8 @@ class KernelDocDirective(Directive): if p.returncode != 0: sys.stderr.write(err) - kernellog.warn(env.app, - 'kernel-doc \'%s\' failed with return code %d' % (" ".join(cmd), p.returncode)) + self.logger.warning("kernel-doc '%s' failed with return code %d" + % (" ".join(cmd), p.returncode)) return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))] elif env.config.kerneldoc_verbosity > 0: sys.stderr.write(err) @@ -148,8 +148,8 @@ class KernelDocDirective(Directive): return node.children except Exception as e: # pylint: disable=W0703 - kernellog.warn(env.app, 'kernel-doc \'%s\' processing failed with: %s' % - (" ".join(cmd), str(e))) + self.logger.warning("kernel-doc '%s' processing failed with: %s" % + (" ".join(cmd), str(e))) return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))] def do_parse(self, result, node): diff --git a/Documentation/sphinx/kernellog.py b/Documentation/sphinx/kernellog.py deleted file mode 100644 index 0bc00c138cade..0000000000000 --- a/Documentation/sphinx/kernellog.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Sphinx has deprecated its older logging interface, but the replacement -# only goes back to 1.6. So here's a wrapper layer to keep around for -# as long as we support 1.4. -# -# We don't support 1.4 anymore, but we'll keep the wrappers around until -# we change all the code to not use them anymore :) -# -import sphinx -from sphinx.util import logging - -logger = logging.getLogger('kerneldoc') - -def warn(app, message): - logger.warning(message) - -def verbose(app, message): - logger.verbose(message) - -def info(app, message): - logger.info(message) diff --git a/Documentation/sphinx/kfigure.py b/Documentation/sphinx/kfigure.py index 97166333b7270..383f9a695b082 100644 --- a/Documentation/sphinx/kfigure.py +++ b/Documentation/sphinx/kfigure.py @@ -59,12 +59,14 @@ from docutils.parsers.rst import directives from docutils.parsers.rst.directives import images import sphinx from sphinx.util.nodes import clean_astext -import kernellog +from sphinx.util import logging Figure = images.Figure __version__ = '1.0.0' +logger = logging.getLogger('kfigure') + # simple helper # ------------- @@ -170,7 +172,7 @@ def setupTools(app): """ global dot_cmd, dot_Tpdf, convert_cmd, rsvg_convert_cmd # pylint: disable=W0603 global inkscape_cmd, inkscape_ver_one # pylint: disable=W0603 - kernellog.verbose(app, "kfigure: check installed tools ...") + logger.verbose("kfigure: check installed tools ...") dot_cmd = which('dot') convert_cmd = which('convert') @@ -178,7 +180,7 @@ def setupTools(app): inkscape_cmd = which('inkscape') if dot_cmd: - kernellog.verbose(app, "use dot(1) from: " + dot_cmd) + logger.verbose("use dot(1) from: " + dot_cmd) try: dot_Thelp_list = subprocess.check_output([dot_cmd, '-Thelp'], @@ -190,10 +192,11 @@ def setupTools(app): dot_Tpdf_ptn = b'pdf' dot_Tpdf = re.search(dot_Tpdf_ptn, dot_Thelp_list) else: - kernellog.warn(app, "dot(1) not found, for better output quality install " - "graphviz from https://www.graphviz.org") + logger.warning( + "dot(1) not found, for better output quality install graphviz from https://www.graphviz.org" + ) if inkscape_cmd: - kernellog.verbose(app, "use inkscape(1) from: " + inkscape_cmd) + logger.verbose("use inkscape(1) from: " + inkscape_cmd) inkscape_ver = subprocess.check_output([inkscape_cmd, '--version'], stderr=subprocess.DEVNULL) ver_one_ptn = b'Inkscape 1' @@ -204,26 +207,27 @@ def setupTools(app): else: if convert_cmd: - kernellog.verbose(app, "use convert(1) from: " + convert_cmd) + logger.verbose("use convert(1) from: " + convert_cmd) else: - kernellog.verbose(app, + logger.verbose( "Neither inkscape(1) nor convert(1) found.\n" - "For SVG to PDF conversion, " - "install either Inkscape (https://inkscape.org/) (preferred) or\n" - "ImageMagick (https://www.imagemagick.org)") + "For SVG to PDF conversion, install either Inkscape (https://inkscape.org/) (preferred) or\n" + "ImageMagick (https://www.imagemagick.org)" + ) if rsvg_convert_cmd: - kernellog.verbose(app, "use rsvg-convert(1) from: " + rsvg_convert_cmd) - kernellog.verbose(app, "use 'dot -Tsvg' and rsvg-convert(1) for DOT -> PDF conversion") + logger.verbose("use rsvg-convert(1) from: " + rsvg_convert_cmd) + logger.verbose("use 'dot -Tsvg' and rsvg-convert(1) for DOT -> PDF conversion") dot_Tpdf = False else: - kernellog.verbose(app, + logger.verbose( "rsvg-convert(1) not found.\n" - " SVG rendering of convert(1) is done by ImageMagick-native renderer.") + " SVG rendering of convert(1) is done by ImageMagick-native renderer." + ) if dot_Tpdf: - kernellog.verbose(app, "use 'dot -Tpdf' for DOT -> PDF conversion") + logger.verbose("use 'dot -Tpdf' for DOT -> PDF conversion") else: - kernellog.verbose(app, "use 'dot -Tsvg' and convert(1) for DOT -> PDF conversion") + logger.verbose("use 'dot -Tsvg' and convert(1) for DOT -> PDF conversion") # integrate conversion tools @@ -257,13 +261,12 @@ def convert_image(img_node, translator, src_fname=None): # in kernel builds, use 'make SPHINXOPTS=-v' to see verbose messages - kernellog.verbose(app, 'assert best format for: ' + img_node['uri']) + logger.verbose('assert best format for: ' + img_node['uri']) if in_ext == '.dot': if not dot_cmd: - kernellog.verbose(app, - "dot from graphviz not available / include DOT raw.") + logger.verbose("dot from graphviz not available / include DOT raw.") img_node.replace_self(file2literal(src_fname)) elif translator.builder.format == 'latex': @@ -290,10 +293,11 @@ def convert_image(img_node, translator, src_fname=None): if translator.builder.format == 'latex': if not inkscape_cmd and convert_cmd is None: - kernellog.warn(app, - "no SVG to PDF conversion available / include SVG raw." - "\nIncluding large raw SVGs can cause xelatex error." - "\nInstall Inkscape (preferred) or ImageMagick.") + logger.warning( + "no SVG to PDF conversion available / include SVG raw.\n" + "Including large raw SVGs can cause xelatex error.\n" + "Install Inkscape (preferred) or ImageMagick." + ) img_node.replace_self(file2literal(src_fname)) else: dst_fname = path.join(translator.builder.outdir, fname + '.pdf') @@ -306,15 +310,14 @@ def convert_image(img_node, translator, src_fname=None): _name = dst_fname[len(str(translator.builder.outdir)) + 1:] if isNewer(dst_fname, src_fname): - kernellog.verbose(app, - "convert: {out}/%s already exists and is newer" % _name) + logger.verbose("convert: {out}/%s already exists and is newer" % _name) else: ok = False mkdir(path.dirname(dst_fname)) if in_ext == '.dot': - kernellog.verbose(app, 'convert DOT to: {out}/' + _name) + logger.verbose('convert DOT to: {out}/' + _name) if translator.builder.format == 'latex' and not dot_Tpdf: svg_fname = path.join(translator.builder.outdir, fname + '.svg') ok1 = dot2format(app, src_fname, svg_fname) @@ -325,7 +328,7 @@ def convert_image(img_node, translator, src_fname=None): ok = dot2format(app, src_fname, dst_fname) elif in_ext == '.svg': - kernellog.verbose(app, 'convert SVG to: {out}/' + _name) + logger.verbose('convert SVG to: {out}/' + _name) ok = svg2pdf(app, src_fname, dst_fname) if not ok: @@ -354,7 +357,7 @@ def dot2format(app, dot_fname, out_fname): with open(out_fname, "w") as out: exit_code = subprocess.call(cmd, stdout = out) if exit_code != 0: - kernellog.warn(app, + logger.warning( "Error #%d when calling: %s" % (exit_code, " ".join(cmd))) return bool(exit_code == 0) @@ -388,13 +391,14 @@ def svg2pdf(app, svg_fname, pdf_fname): pass if exit_code != 0: - kernellog.warn(app, "Error #%d when calling: %s" % (exit_code, " ".join(cmd))) + logger.warning("Error #%d when calling: %s" % + (exit_code, " ".join(cmd))) if warning_msg: - kernellog.warn(app, "Warning msg from %s: %s" - % (cmd_name, str(warning_msg, 'utf-8'))) + logger.warning( "Warning msg from %s: %s" % + (cmd_name, str(warning_msg, 'utf-8'))) elif warning_msg: - kernellog.verbose(app, "Warning msg from %s (likely harmless):\n%s" - % (cmd_name, str(warning_msg, 'utf-8'))) + logger.verbose("Warning msg from %s (likely harmless):\n%s" % + (cmd_name, str(warning_msg, 'utf-8'))) return bool(exit_code == 0) @@ -418,7 +422,8 @@ def svg2pdf_by_rsvg(app, svg_fname, pdf_fname): # use stdout and stderr from parent exit_code = subprocess.call(cmd) if exit_code != 0: - kernellog.warn(app, "Error #%d when calling: %s" % (exit_code, " ".join(cmd))) + logger.warning("Error #%d when calling: %s" % + (exit_code, " ".join(cmd))) ok = bool(exit_code == 0) return ok @@ -513,15 +518,15 @@ def visit_kernel_render(self, node): app = self.builder.app srclang = node.get('srclang') - kernellog.verbose(app, 'visit kernel-render node lang: "%s"' % (srclang)) + logger.verbose('visit kernel-render node lang: "%s"' % srclang) tmp_ext = RENDER_MARKUP_EXT.get(srclang, None) if tmp_ext is None: - kernellog.warn(app, 'kernel-render: "%s" unknown / include raw.' % (srclang)) + logger.warning( 'kernel-render: "%s" unknown / include raw.' % srclang) return if not dot_cmd and tmp_ext == '.dot': - kernellog.verbose(app, "dot from graphviz not available / include raw.") + logger.verbose("dot from graphviz not available / include raw.") return literal_block = node[0] -- GitLab From faccc0ec64e193b6aa9894eec134165317ae2a0f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:51 +0100 Subject: [PATCH 172/934] docs: sphinx/kernel_abi: adjust coding style Make pylint and flake8 happier with this module's coding style Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/02a9ec0fab61e4c94b9c2ff555bc0e9d93f59100.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/kernel_abi.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 8401562cb5d9d..38653f5706c0f 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -32,32 +32,31 @@ u""" """ -import codecs import os +import re import subprocess import sys -import re -from docutils import nodes, statemachine +from docutils import nodes from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive -from docutils.utils.error_reporting import ErrorString from sphinx.util.docutils import switch_source_input from sphinx.util import logging -__version__ = '1.0' +__version__ = "1.0" + def setup(app): app.add_directive("kernel-abi", KernelCmd) - return dict( - version = __version__ - , parallel_read_safe = True - , parallel_write_safe = True - ) + return { + "version": __version__, + "parallel_read_safe": True, + "parallel_write_safe": True + } -class KernelCmd(Directive): +class KernelCmd(Directive): u"""KernelABI (``kernel-abi``) directive""" required_arguments = 1 @@ -99,8 +98,8 @@ class KernelCmd(Directive): if "debug" in self.options: code_block = "\n\n.. code-block:: rst\n :linenos:\n" - for l in lines.split("\n"): - code_block += "\n " + l + for line in lines.split("\n"): + code_block += "\n " + line lines = code_block + "\n\n" line_regex = re.compile(r"^\.\. LINENO (\S+)\#([0-9]+)$") -- GitLab From 7ceb84b7262541841509a6edef17448414d63da3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:52 +0100 Subject: [PATCH 173/934] docs: admin-guide: abi: add SPDX tags to ABI files Such files are missing SPDX tags containing the licensing information. Add them. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/8c7bfe676e7349ea2d1930bf918d54e27d15ae9e.1739182025.git.mchehab+huawei@kernel.org --- Documentation/admin-guide/abi-obsolete.rst | 2 ++ Documentation/admin-guide/abi-removed.rst | 2 ++ Documentation/admin-guide/abi-stable.rst | 2 ++ Documentation/admin-guide/abi-testing.rst | 2 ++ Documentation/admin-guide/abi.rst | 2 ++ 5 files changed, 10 insertions(+) diff --git a/Documentation/admin-guide/abi-obsolete.rst b/Documentation/admin-guide/abi-obsolete.rst index 594e697aa1b2f..b655615917f1e 100644 --- a/Documentation/admin-guide/abi-obsolete.rst +++ b/Documentation/admin-guide/abi-obsolete.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI obsolete symbols ==================== diff --git a/Documentation/admin-guide/abi-removed.rst b/Documentation/admin-guide/abi-removed.rst index f9e000c81828e..ba941c1af1784 100644 --- a/Documentation/admin-guide/abi-removed.rst +++ b/Documentation/admin-guide/abi-removed.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI removed symbols =================== diff --git a/Documentation/admin-guide/abi-stable.rst b/Documentation/admin-guide/abi-stable.rst index fc3361d847b12..5d738f345333e 100644 --- a/Documentation/admin-guide/abi-stable.rst +++ b/Documentation/admin-guide/abi-stable.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI stable symbols ================== diff --git a/Documentation/admin-guide/abi-testing.rst b/Documentation/admin-guide/abi-testing.rst index 19767926b3440..a867e6578bf7d 100644 --- a/Documentation/admin-guide/abi-testing.rst +++ b/Documentation/admin-guide/abi-testing.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI testing symbols =================== diff --git a/Documentation/admin-guide/abi.rst b/Documentation/admin-guide/abi.rst index bcab3ef2597ca..64e772bde943d 100644 --- a/Documentation/admin-guide/abi.rst +++ b/Documentation/admin-guide/abi.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ===================== Linux ABI description ===================== -- GitLab From 33a8b6509de379e844bf0fc13af23dc3c0fde6e5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:53 +0100 Subject: [PATCH 174/934] ABI: sysfs-class-rfkill: fix kernelversion tags Some kernelversion tags are missing colons. Add them to comply with ABI description and produce right results when converted to html/pdf. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/b2e38f7857e8fddad03401a2ae6c5af5ca8db507.1739182025.git.mchehab+huawei@kernel.org --- Documentation/ABI/removed/sysfs-class-rfkill | 2 +- Documentation/ABI/stable/sysfs-class-rfkill | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/removed/sysfs-class-rfkill b/Documentation/ABI/removed/sysfs-class-rfkill index f25174eafd55a..20cb688af173b 100644 --- a/Documentation/ABI/removed/sysfs-class-rfkill +++ b/Documentation/ABI/removed/sysfs-class-rfkill @@ -4,7 +4,7 @@ For details to this subsystem look at Documentation/driver-api/rfkill.rst. What: /sys/class/rfkill/rfkill[0-9]+/claim Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: This file was deprecated because there no longer was a way to claim just control over a single rfkill instance. diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill index 037979f7dc4be..67b605e3dd168 100644 --- a/Documentation/ABI/stable/sysfs-class-rfkill +++ b/Documentation/ABI/stable/sysfs-class-rfkill @@ -16,7 +16,7 @@ Description: The rfkill class subsystem folder. What: /sys/class/rfkill/rfkill[0-9]+/name Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Name assigned by driver to this key (interface or driver name). Values: arbitrary string. @@ -24,7 +24,7 @@ Values: arbitrary string. What: /sys/class/rfkill/rfkill[0-9]+/type Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Driver type string ("wlan", "bluetooth", etc). Values: See include/linux/rfkill.h. @@ -32,7 +32,7 @@ Values: See include/linux/rfkill.h. What: /sys/class/rfkill/rfkill[0-9]+/persistent Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Whether the soft blocked state is initialised from non-volatile storage at startup. @@ -44,7 +44,7 @@ Values: A numeric value: What: /sys/class/rfkill/rfkill[0-9]+/state Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Current state of the transmitter. This file was scheduled to be removed in 2014, but due to its @@ -67,7 +67,7 @@ Values: A numeric value. What: /sys/class/rfkill/rfkill[0-9]+/hard Date: 12-March-2010 -KernelVersion v2.6.34 +KernelVersion: v2.6.34 Contact: linux-wireless@vger.kernel.org Description: Current hardblock state. This file is read only. Values: A numeric value. @@ -81,7 +81,7 @@ Values: A numeric value. What: /sys/class/rfkill/rfkill[0-9]+/soft Date: 12-March-2010 -KernelVersion v2.6.34 +KernelVersion: v2.6.34 Contact: linux-wireless@vger.kernel.org Description: Current softblock state. This file is read and write. Values: A numeric value. -- GitLab From fc80c4f02639e4a9fe08ed438135bc72b76a5654 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:54 +0100 Subject: [PATCH 175/934] ABI: sysfs-bus-coresight-*: fix kernelversion tags Some kernelversion tags are missing colons. Add them to comply with ABI description and produce right results when converted to html/pdf. Signed-off-by: Mauro Carvalho Chehab Acked-by: Suzuki K Poulose Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/72c3a6583c2ffca23ae9ee1c0b6dc98618ae0775.1739182025.git.mchehab+huawei@kernel.org --- .../testing/sysfs-bus-coresight-devices-cti | 78 +++++++++---------- .../testing/sysfs-bus-coresight-devices-tpdm | 52 ++++++------- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti b/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti index bf2869c413e72..a97b70f588da8 100644 --- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti +++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti @@ -1,241 +1,241 @@ What: /sys/bus/coresight/devices//enable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Enable/Disable the CTI hardware. What: /sys/bus/coresight/devices//powered Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Indicate if the CTI hardware is powered. What: /sys/bus/coresight/devices//ctmid Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Display the associated CTM ID What: /sys/bus/coresight/devices//nr_trigger_cons Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Number of devices connected to triggers on this CTI What: /sys/bus/coresight/devices//triggers/name Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Name of connected device What: /sys/bus/coresight/devices//triggers/in_signals Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Input trigger signals from connected device What: /sys/bus/coresight/devices//triggers/in_types Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Functional types for the input trigger signals from connected device What: /sys/bus/coresight/devices//triggers/out_signals Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Output trigger signals to connected device What: /sys/bus/coresight/devices//triggers/out_types Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Functional types for the output trigger signals to connected device What: /sys/bus/coresight/devices//regs/inout_sel Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Select the index for inen and outen registers. What: /sys/bus/coresight/devices//regs/inen Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write the CTIINEN register selected by inout_sel. What: /sys/bus/coresight/devices//regs/outen Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write the CTIOUTEN register selected by inout_sel. What: /sys/bus/coresight/devices//regs/gate Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write CTIGATE register. What: /sys/bus/coresight/devices//regs/asicctl Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write ASICCTL register. What: /sys/bus/coresight/devices//regs/intack Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Write the INTACK register. What: /sys/bus/coresight/devices//regs/appset Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Set CTIAPPSET register to activate channel. Read back to determine current value of register. What: /sys/bus/coresight/devices//regs/appclear Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Write APPCLEAR register to deactivate channel. What: /sys/bus/coresight/devices//regs/apppulse Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Write APPPULSE to pulse a channel active for one clock cycle. What: /sys/bus/coresight/devices//regs/chinstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Read current status of channel inputs. What: /sys/bus/coresight/devices//regs/choutstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) read current status of channel outputs. What: /sys/bus/coresight/devices//regs/triginstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) read current status of input trigger signals What: /sys/bus/coresight/devices//regs/trigoutstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) read current status of output trigger signals. What: /sys/bus/coresight/devices//channels/trigin_attach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Attach a CTI input trigger to a CTM channel. What: /sys/bus/coresight/devices//channels/trigin_detach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Detach a CTI input trigger from a CTM channel. What: /sys/bus/coresight/devices//channels/trigout_attach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Attach a CTI output trigger to a CTM channel. What: /sys/bus/coresight/devices//channels/trigout_detach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Detach a CTI output trigger from a CTM channel. What: /sys/bus/coresight/devices//channels/chan_gate_enable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Enable CTIGATE for single channel (Write) or list enabled channels through the gate (R). What: /sys/bus/coresight/devices//channels/chan_gate_disable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Disable CTIGATE for single channel. What: /sys/bus/coresight/devices//channels/chan_set Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Activate a single channel. What: /sys/bus/coresight/devices//channels/chan_clear Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Deactivate a single channel. What: /sys/bus/coresight/devices//channels/chan_pulse Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Pulse a single channel - activate for a single clock cycle. What: /sys/bus/coresight/devices//channels/trigout_filtered Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) List of output triggers filtered across all connections. What: /sys/bus/coresight/devices//channels/trig_filter_enable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Enable or disable trigger output signal filtering. What: /sys/bus/coresight/devices//channels/chan_inuse Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) show channels with at least one attached trigger signal. What: /sys/bus/coresight/devices//channels/chan_free Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) show channels with no attached trigger signals. What: /sys/bus/coresight/devices//channels/chan_xtrigs_sel Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Write channel number to select a channel to view, read to see selected channel number. What: /sys/bus/coresight/devices//channels/chan_xtrigs_in Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Read to see input triggers connected to selected view channel. What: /sys/bus/coresight/devices//channels/chan_xtrigs_out Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Read to see output triggers connected to selected view channel. What: /sys/bus/coresight/devices//channels/chan_xtrigs_reset Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Clear all channel / trigger programming. diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm index bf710ea6e0efc..53cb454b60d0e 100644 --- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm +++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm @@ -1,6 +1,6 @@ What: /sys/bus/coresight/devices//integration_test Date: January 2023 -KernelVersion 6.2 +KernelVersion: 6.2 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (Write) Run integration test for tpdm. Integration test @@ -14,7 +14,7 @@ Description: What: /sys/bus/coresight/devices//reset_dataset Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (Write) Reset the dataset of the tpdm. @@ -24,7 +24,7 @@ Description: What: /sys/bus/coresight/devices//dsb_trig_type Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the trigger type of the DSB for tpdm. @@ -35,7 +35,7 @@ Description: What: /sys/bus/coresight/devices//dsb_trig_ts Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the trigger timestamp of the DSB for tpdm. @@ -46,7 +46,7 @@ Description: What: /sys/bus/coresight/devices//dsb_mode Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the programming mode of the DSB for tpdm. @@ -60,7 +60,7 @@ Description: What: /sys/bus/coresight/devices//dsb_edge/ctrl_idx Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the index number of the edge detection for the DSB @@ -69,7 +69,7 @@ Description: What: /sys/bus/coresight/devices//dsb_edge/ctrl_val Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: Write a data to control the edge detection corresponding to @@ -85,7 +85,7 @@ Description: What: /sys/bus/coresight/devices//dsb_edge/ctrl_mask Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: Write a data to mask the edge detection corresponding to the index @@ -97,21 +97,21 @@ Description: What: /sys/bus/coresight/devices//dsb_edge/edcr[0:15] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: Read a set of the edge control value of the DSB in TPDM. What: /sys/bus/coresight/devices//dsb_edge/edcmr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: Read a set of the edge control mask of the DSB in TPDM. What: /sys/bus/coresight/devices//dsb_trig_patt/xpr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the value of the trigger pattern for the DSB @@ -119,7 +119,7 @@ Description: What: /sys/bus/coresight/devices//dsb_trig_patt/xpmr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the mask of the trigger pattern for the DSB @@ -127,21 +127,21 @@ Description: What: /sys/bus/coresight/devices//dsb_patt/tpr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the value of the pattern for the DSB subunit TPDM. What: /sys/bus/coresight/devices//dsb_patt/tpmr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the mask of the pattern for the DSB subunit TPDM. What: /sys/bus/coresight/devices//dsb_patt/enable_ts Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (Write) Set the pattern timestamp of DSB tpdm. Read @@ -153,7 +153,7 @@ Description: What: /sys/bus/coresight/devices//dsb_patt/set_type Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (Write) Set the pattern type of DSB tpdm. Read @@ -165,7 +165,7 @@ Description: What: /sys/bus/coresight/devices//dsb_msr/msr[0:31] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the MSR(mux select register) for the DSB subunit @@ -173,7 +173,7 @@ Description: What: /sys/bus/coresight/devices//cmb_mode Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (Write) Set the data collection mode of CMB tpdm. Continuous change creates CMB data set elements on every CMBCLK edge. @@ -187,7 +187,7 @@ Description: (Write) Set the data collection mode of CMB tpdm. Continuous What: /sys/bus/coresight/devices//cmb_trig_patt/xpr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the value of the trigger pattern for the CMB @@ -195,7 +195,7 @@ Description: What: /sys/bus/coresight/devices//cmb_trig_patt/xpmr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the mask of the trigger pattern for the CMB @@ -203,21 +203,21 @@ Description: What: /sys/bus/coresight/devices//dsb_patt/tpr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the value of the pattern for the CMB subunit TPDM. What: /sys/bus/coresight/devices//dsb_patt/tpmr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the mask of the pattern for the CMB subunit TPDM. What: /sys/bus/coresight/devices//cmb_patt/enable_ts Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (Write) Set the pattern timestamp of CMB tpdm. Read @@ -229,7 +229,7 @@ Description: What: /sys/bus/coresight/devices//cmb_trig_ts Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the trigger timestamp of the CMB for tpdm. @@ -240,7 +240,7 @@ Description: What: /sys/bus/coresight/devices//cmb_ts_all Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Read or write the status of timestamp upon all interface. @@ -252,7 +252,7 @@ Description: What: /sys/bus/coresight/devices//cmb_msr/msr[0:31] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) , Tao Zhang (QUIC) Description: (RW) Set/Get the MSR(mux select register) for the CMB subunit -- GitLab From a396f6297933651a3cdb3e59af35b5aa189dccdb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:55 +0100 Subject: [PATCH 176/934] ABI: sysfs-driver-dma-idxd: fix date tags Some date tags are missing colons. Add them to comply with ABI description and produce right results when converted to html/pdf. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/d5b7641e7a6ed461d889db5198cb557a68f27a6d.1739182025.git.mchehab+huawei@kernel.org --- Documentation/ABI/stable/sysfs-driver-dma-idxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index f2ec42949a54d..4a355e6747ae5 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -246,14 +246,14 @@ Description: Controls whether PRS disable is turned on for the workqueue. capability. What: /sys/bus/dsa/devices/wq./occupancy -Date May 25, 2021 +Date: May 25, 2021 KernelVersion: 5.14.0 Contact: dmaengine@vger.kernel.org Description: Show the current number of entries in this WQ if WQ Occupancy Support bit WQ capabilities is 1. What: /sys/bus/dsa/devices/wq./enqcmds_retries -Date Oct 29, 2021 +Date: Oct 29, 2021 KernelVersion: 5.17.0 Contact: dmaengine@vger.kernel.org Description: Indicate the number of retires for an enqcmds submission on a sharedwq. -- GitLab From 90800df0da787cd310edc7715ab7787964e93cf9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:56 +0100 Subject: [PATCH 177/934] ABI: sysfs-fs-f2fs: fix date tags Some date tags are missing colons. Add them to comply with ABI description and produce right results when converted to html/pdf. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/336ab631c0636e419282a38e7dd5b5cfb52fcd2d.1739182025.git.mchehab+huawei@kernel.org --- Documentation/ABI/testing/sysfs-fs-f2fs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 3e1630c70d8ae..e44bb614964b3 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -347,7 +347,7 @@ Description: Used to control configure extension list: - [c] means add/del cold file extension What: /sys/fs/f2fs//unusable -Date April 2019 +Date: April 2019 Contact: "Daniel Rosenberg" Description: If checkpoint=disable, it displays the number of blocks that are unusable. @@ -355,7 +355,7 @@ Description: If checkpoint=disable, it displays the number of blocks that would be unusable if checkpoint=disable were to be set. What: /sys/fs/f2fs//encoding -Date July 2019 +Date: July 2019 Contact: "Daniel Rosenberg" Description: Displays name and version of the encoding set for the filesystem. If no encoding is set, displays (none) -- GitLab From 01d009147946e753eac75ac9079cdce0ce00df44 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:57 +0100 Subject: [PATCH 178/934] ABI: sysfs-power: fix a what tag There is one What tag that it is using semicolon instead of colon. Fix it to comply with ABI description and produce right results when converted to html/pdf. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/508051136ea2e07e0dd7fa41ff40382387c24ba8.1739182025.git.mchehab+huawei@kernel.org --- Documentation/ABI/testing/sysfs-power | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index a3942b1036e25..2192478e83cfc 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -131,7 +131,7 @@ Description: CAUTION: Using it will cause your machine's real-time (CMOS) clock to be set to a random invalid time after a resume. -What; /sys/power/pm_trace_dev_match +What: /sys/power/pm_trace_dev_match Date: October 2010 Contact: James Hogan Description: -- GitLab From 790ca8b0b5a343090b148d9c0c1ddb2d1a012952 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:58 +0100 Subject: [PATCH 179/934] scripts/documentation-file-ref-check: don't check perl/python scripts Such scripts may have regular expressions, which would make the parser confusing. Also, they shouldn't hardcode filenames there, so skipping them is OK. While here, also don't check references on extensions used for file backup and patch rej/orig. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/712bfc8412ee5ad8ab43dd21a8c30fc858eff5a6.1739182025.git.mchehab+huawei@kernel.org --- scripts/documentation-file-ref-check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/documentation-file-ref-check b/scripts/documentation-file-ref-check index 68083f2f11220..408b1dbe78841 100755 --- a/scripts/documentation-file-ref-check +++ b/scripts/documentation-file-ref-check @@ -92,7 +92,7 @@ while () { next if ($f =~ m,^Next/,); # Makefiles and scripts contain nasty expressions to parse docs - next if ($f =~ m/Makefile/ || $f =~ m/\.sh$/); + next if ($f =~ m/Makefile/ || $f =~ m/\.(sh|py|pl|~|rej|org|orig)$/); # It doesn't make sense to parse hidden files next if ($f =~ m#/\.#); -- GitLab From 484e9aa6efaf96a7a5b5fe3216f24973166fbfe3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:17:59 +0100 Subject: [PATCH 180/934] scripts/get_abi.py: add a Python tool to generate ReST output The get_abi.pl script is requiring some care, but it seems that the number of changes on it since when I originally wrote it was not too high. Maintaining perl scripts without using classes requires a higher efforted than on python, due to global variables management. Also, it sounds easier to find python developer those days than perl ones. As a plus, using a Python class to handle ABI allows a better integration with Sphinx extensions, allowing, for instance, to let automarkup to generate cross-references for ABI symbols. With that in mind, rewrite the core of get_abi.pl in Python, using classes, to help producing documentation. This will allow a better integration in the future with the Sphinx ABI extension. The algorithms used there are the same as the ones in Perl, with a couple of cleanups to remove redundant variables and to help with cross-reference generation. While doing that, remove some code that were important in the past, where ABI files weren't using ReST format. Some minor improvements were added like using a fixed seed when generating ABI keys for duplicated names, making its results reproductible. The end script is a little bit faster than the original one (tested on a machine with ssd disks). That's probably because we're now using only pre-compiled regular expressions, and it is using string replacement methods instead of regex where possible. The new version is a little bit more conservative when converting text to cross-references to avoid adding them into literal blocks. To ensure that the ReST output is parsing all variables and files properly, the end result was compared using diff with the one produced by the perl script and showed no regressions. There are minor improvements at the results, as it now properly groups What on some special cases. It also better escape some XREF names. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/71a894211a8b69664711144d9c4f8a0e73d1ae3c.1739182025.git.mchehab+huawei@kernel.org --- scripts/get_abi.py | 118 ++++++++ scripts/lib/abi/abi_parser.py | 512 ++++++++++++++++++++++++++++++++++ scripts/lib/abi/helpers.py | 28 ++ 3 files changed, 658 insertions(+) create mode 100755 scripts/get_abi.py create mode 100644 scripts/lib/abi/abi_parser.py create mode 100644 scripts/lib/abi/helpers.py diff --git a/scripts/get_abi.py b/scripts/get_abi.py new file mode 100755 index 0000000000000..bb17c54feeffe --- /dev/null +++ b/scripts/get_abi.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# pylint: disable=R0903 +# Copyright(c) 2025: Mauro Carvalho Chehab . +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. +""" + +import argparse +import logging +import os +import sys + +# Import Python modules + +LIB_DIR = "lib/abi" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +from abi_parser import AbiParser # pylint: disable=C0413 +from helpers import ABI_DIR, DEBUG_HELP # pylint: disable=C0413 + +# Command line classes + + +REST_DESC = """ +Produce output in ReST format. + +The output is done on two sections: + +- Symbols: show all parsed symbols in alphabetic order; +- Files: cross reference the content of each file with the symbols on it. +""" + +class AbiRest: + """Initialize an argparse subparser for rest output""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("rest", + formatter_class=argparse.RawTextHelpFormatter, + description=REST_DESC) + + parser.add_argument("--enable-lineno", action="store_true", + help="enable lineno") + parser.add_argument("--raw", action="store_true", + help="output text as contained in the ABI files. " + "It not used, output will contain dynamically" + " generated cross references when possible.") + parser.add_argument("--no-file", action="store_true", + help="Don't the files section") + parser.add_argument("--show-hints", help="Show-hints") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.check_issues() + parser.print_data(args.enable_lineno, args.raw, not args.no_file) + + +class AbiValidate: + """Initialize an argparse subparser for ABI validation""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("validate", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="list events") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.check_issues() + + +def main(): + """Main program""" + + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("-d", "--debug", type=int, default=0, help="debug level") + parser.add_argument("-D", "--dir", default=ABI_DIR, help=DEBUG_HELP) + + subparsers = parser.add_subparsers() + + AbiRest(subparsers) + AbiValidate(subparsers) + + args = parser.parse_args() + + if args.debug: + level = logging.DEBUG + else: + level = logging.INFO + + logging.basicConfig(level=level, format="[%(levelname)s] %(message)s") + + if "func" in args: + args.func(args) + else: + sys.exit(f"Please specify a valid command for {sys.argv[0]}") + + +# Call main method +if __name__ == "__main__": + main() diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py new file mode 100644 index 0000000000000..b3fa70eee4128 --- /dev/null +++ b/scripts/lib/abi/abi_parser.py @@ -0,0 +1,512 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0903,R0911,R0912,R0913,R0914,R0915,R0917,C0302 +# Copyright(c) 2025: Mauro Carvalho Chehab . +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. +""" + +from argparse import Namespace +import logging +import os +import re + +from glob import glob +from pprint import pformat +from random import randrange, seed + +# Import Python modules + +from helpers import AbiDebug, ABI_DIR + + +class AbiParser: + """Main class to parse ABI files""" + + TAGS = r"(what|where|date|kernelversion|contact|description|users)" + XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)" + + def __init__(self, directory, logger=None, + enable_lineno=False, show_warnings=True, debug=0): + """Stores arguments for the class and initialize class vars""" + + self.directory = directory + self.enable_lineno = enable_lineno + self.show_warnings = show_warnings + self.debug = debug + + if not logger: + self.log = logging.getLogger("get_abi") + else: + self.log = logger + + self.data = {} + self.what_symbols = {} + self.file_refs = {} + self.what_refs = {} + + # Regular expressions used on parser + self.re_tag = re.compile(r"(\S+)(:\s*)(.*)", re.I) + self.re_valid = re.compile(self.TAGS) + self.re_start_spc = re.compile(r"(\s*)(\S.*)") + self.re_whitespace = re.compile(r"^\s+") + + # Regular used on print + self.re_what = re.compile(r"(\/?(?:[\w\-]+\/?){1,2})") + self.re_escape = re.compile(r"([\.\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])") + self.re_unprintable = re.compile(r"([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff]+)") + self.re_title_mark = re.compile(r"\n[\-\*\=\^\~]+\n") + self.re_doc = re.compile(r"Documentation/(?!devicetree)(\S+)\.rst") + self.re_abi = re.compile(r"(Documentation/ABI/)([\w\/\-]+)") + self.re_xref_node = re.compile(self.XREF) + + def warn(self, fdata, msg, extra=None): + """Displays a parse error if warning is enabled""" + + if not self.show_warnings: + return + + msg = f"{fdata.fname}:{fdata.ln}: {msg}" + if extra: + msg += "\n\t\t" + extra + + self.log.warning(msg) + + def add_symbol(self, what, fname, ln=None, xref=None): + """Create a reference table describing where each 'what' is located""" + + if what not in self.what_symbols: + self.what_symbols[what] = {"file": {}} + + if fname not in self.what_symbols[what]["file"]: + self.what_symbols[what]["file"][fname] = [] + + if ln and ln not in self.what_symbols[what]["file"][fname]: + self.what_symbols[what]["file"][fname].append(ln) + + if xref: + self.what_symbols[what]["xref"] = xref + + def _parse_line(self, fdata, line): + """Parse a single line of an ABI file""" + + new_what = False + new_tag = False + content = None + + match = self.re_tag.match(line) + if match: + new = match.group(1).lower() + sep = match.group(2) + content = match.group(3) + + match = self.re_valid.search(new) + if match: + new_tag = match.group(1) + else: + if fdata.tag == "description": + # New "tag" is actually part of description. + # Don't consider it a tag + new_tag = False + elif fdata.tag != "": + self.warn(fdata, f"tag '{fdata.tag}' is invalid", line) + + if new_tag: + # "where" is Invalid, but was a common mistake. Warn if found + if new_tag == "where": + self.warn(fdata, "tag 'Where' is invalid. Should be 'What:' instead") + new_tag = "what" + + if new_tag == "what": + fdata.space = None + + if content not in self.what_symbols: + self.add_symbol(what=content, fname=fdata.fname, ln=fdata.ln) + + if fdata.tag == "what": + fdata.what.append(content.strip("\n")) + else: + if fdata.key: + if "description" not in self.data.get(fdata.key, {}): + self.warn(fdata, f"{fdata.key} doesn't have a description") + + for w in fdata.what: + self.add_symbol(what=w, fname=fdata.fname, + ln=fdata.what_ln, xref=fdata.key) + + fdata.label = content + new_what = True + + key = "abi_" + content.lower() + fdata.key = self.re_unprintable.sub("_", key).strip("_") + + # Avoid duplicated keys but using a defined seed, to make + # the namespace identical if there aren't changes at the + # ABI symbols + seed(42) + + while fdata.key in self.data: + char = randrange(0, 51) + ord("A") + if char > ord("Z"): + char += ord("a") - ord("Z") - 1 + + fdata.key += chr(char) + + if fdata.key and fdata.key not in self.data: + self.data[fdata.key] = { + "what": [content], + "file": [fdata.file_ref], + "line_no": fdata.ln, + } + + fdata.what = self.data[fdata.key]["what"] + + self.what_refs[content] = fdata.key + fdata.tag = new_tag + fdata.what_ln = fdata.ln + + if fdata.nametag["what"]: + t = (content, fdata.key) + if t not in fdata.nametag["symbols"]: + fdata.nametag["symbols"].append(t) + + return + + if fdata.tag and new_tag: + fdata.tag = new_tag + + if new_what: + fdata.label = "" + + self.data[fdata.key]["type"] = fdata.ftype + + if "description" in self.data[fdata.key]: + self.data[fdata.key]["description"] += "\n\n" + + if fdata.file_ref not in self.data[fdata.key]["file"]: + self.data[fdata.key]["file"].append(fdata.file_ref) + + if self.debug == AbiDebug.WHAT_PARSING: + self.log.debug("what: %s", fdata.what) + + if not fdata.what: + self.warn(fdata, "'What:' should come first:", line) + return + + if new_tag == "description": + fdata.space = None + + if content: + sep = sep.replace(":", " ") + + c = " " * len(new_tag) + sep + content + c = c.expandtabs() + + match = self.re_start_spc.match(c) + if match: + # Preserve initial spaces for the first line + fdata.space = match.group(1) + content = match.group(2) + "\n" + + self.data[fdata.key][fdata.tag] = content + + return + + # Store any contents before tags at the database + if not fdata.tag and "what" in fdata.nametag: + fdata.nametag["description"] += line + return + + if fdata.tag == "description": + content = line.expandtabs() + + if self.re_whitespace.sub("", content) == "": + self.data[fdata.key][fdata.tag] += "\n" + return + + if fdata.space is None: + match = self.re_start_spc.match(content) + if match: + # Preserve initial spaces for the first line + fdata.space = match.group(1) + + content = match.group(2) + "\n" + else: + if content.startswith(fdata.space): + content = content[len(fdata.space):] + + else: + fdata.space = "" + + if fdata.tag == "what": + w = content.strip("\n") + if w: + self.data[fdata.key][fdata.tag].append(w) + else: + self.data[fdata.key][fdata.tag] += content + return + + content = line.strip() + if fdata.tag: + if fdata.tag == "what": + w = content.strip("\n") + if w: + self.data[fdata.key][fdata.tag].append(w) + else: + self.data[fdata.key][fdata.tag] += "\n" + content.rstrip("\n") + return + + # Everything else is error + if content: + self.warn(fdata, "Unexpected content", line) + + def parse_file(self, fname, path, basename): + """Parse a single file""" + + ref = f"abi_file_{path}_{basename}" + ref = self.re_unprintable.sub("_", ref).strip("_") + + # Store per-file state into a namespace variable. This will be used + # by the per-line parser state machine and by the warning function. + fdata = Namespace + + fdata.fname = fname + fdata.name = basename + + pos = fname.find(ABI_DIR) + if pos > 0: + f = fname[pos:] + else: + f = fname + + fdata.file_ref = (f, ref) + self.file_refs[f] = ref + + fdata.ln = 0 + fdata.what_ln = 0 + fdata.tag = "" + fdata.label = "" + fdata.what = [] + fdata.key = None + fdata.xrefs = None + fdata.space = None + fdata.ftype = path.split("/")[0] + + fdata.nametag = {} + fdata.nametag["what"] = [f"File {path}/{basename}"] + fdata.nametag["type"] = "File" + fdata.nametag["file"] = [fdata.file_ref] + fdata.nametag["line_no"] = 1 + fdata.nametag["description"] = "" + fdata.nametag["symbols"] = [] + + self.data[ref] = fdata.nametag + + if self.debug & AbiDebug.WHAT_OPEN: + self.log.debug("Opening file %s", fname) + + with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: + for line in fp: + fdata.ln += 1 + + self._parse_line(fdata, line) + + if "description" in fdata.nametag: + fdata.nametag["description"] = fdata.nametag["description"].lstrip("\n") + + if fdata.key: + if "description" not in self.data.get(fdata.key, {}): + self.warn(fdata, f"{fdata.key} doesn't have a description") + + for w in fdata.what: + self.add_symbol(what=w, fname=fname, xref=fdata.key) + + def parse_abi(self): + """Parse documentation ABI""" + + ignore_suffixes = ("rej", "org", "orig", "bak", "~") + re_abi = re.compile(r".*" + ABI_DIR) + + for fname in glob(os.path.join(self.directory, "**"), recursive=True): + if os.path.isdir(fname): + continue + + basename = os.path.basename(fname) + + if basename == "README": + continue + if basename.startswith(".") or basename.endswith(ignore_suffixes): + continue + + path = re_abi.sub("", os.path.dirname(fname)) + + self.parse_file(fname, path, basename) + + if self.debug & AbiDebug.DUMP_ABI_STRUCTS: + self.log.debug(pformat(self.data)) + + def print_desc_txt(self, desc): + """Print description as found inside ABI files""" + + desc = desc.strip(" \t\n") + + print(desc + "\n") + + def print_desc_rst(self, desc): + """Enrich ReST output by creating cross-references""" + + # Remove title markups from the description + # Having titles inside ABI files will only work if extra + # care would be taken in order to strictly follow the same + # level order for each markup. + desc = self.re_title_mark.sub("\n\n", "\n" + desc) + desc = desc.rstrip(" \t\n").lstrip("\n") + + # Python's regex performance for non-compiled expressions is a lot + # than Perl, as Perl automatically caches them at their + # first usage. Here, we'll need to do the same, as otherwise the + # performance penalty is be high + + new_desc = "" + for d in desc.split("\n"): + if d == "": + new_desc += "\n" + continue + + # Use cross-references for doc files where needed + d = self.re_doc.sub(r":doc:`/\1`", d) + + # Use cross-references for ABI generated docs where needed + matches = self.re_abi.findall(d) + for m in matches: + abi = m[0] + m[1] + + xref = self.file_refs.get(abi) + if not xref: + # This may happen if ABI is on a separate directory, + # like parsing ABI testing and symbol is at stable. + # The proper solution is to move this part of the code + # for it to be inside sphinx/kernel_abi.py + self.log.info("Didn't find ABI reference for '%s'", abi) + else: + new = self.re_escape.sub(r"\\\1", m[1]) + d = re.sub(fr"\b{abi}\b", f":ref:`{new} <{xref}>`", d) + + # Seek for cross reference symbols like /sys/... + # Need to be careful to avoid doing it on a code block + if d[0] not in [" ", "\t"]: + matches = self.re_xref_node.findall(d) + for m in matches: + # Finding ABI here is more complex due to wildcards + xref = self.what_refs.get(m) + if xref: + new = self.re_escape.sub(r"\\\1", m) + d = re.sub(fr"\b{m}\b", f":ref:`{new} <{xref}>`", d) + + new_desc += d + "\n" + + print(new_desc + "\n") + + def print_data(self, enable_lineno, output_in_txt, show_file=False): + """Print ABI at stdout""" + + part = None + for key, v in sorted(self.data.items(), + key=lambda x: (x[1].get("type", ""), + x[1].get("what"))): + + wtype = v.get("type", "Var") + file_ref = v.get("file") + names = v.get("what", [""]) + + if not show_file and wtype == "File": + continue + + if enable_lineno: + ln = v.get("line_no", 1) + print(f".. LINENO {file_ref[0][0]}#{ln}\n") + + if wtype != "File": + cur_part = names[0] + if cur_part.find("/") >= 0: + match = self.re_what.match(cur_part) + if match: + symbol = match.group(1).rstrip("/") + cur_part = "Symbols under " + symbol + + if cur_part and cur_part != part: + part = cur_part + print(f"{part}\n{"-" * len(part)}\n") + + print(f".. _{key}:\n") + + max_len = 0 + for i in range(0, len(names)): # pylint: disable=C0200 + names[i] = "**" + self.re_escape.sub(r"\\\1", names[i]) + "**" + + max_len = max(max_len, len(names[i])) + + print("+-" + "-" * max_len + "-+") + for name in names: + print(f"| {name}" + " " * (max_len - len(name)) + " |") + print("+-" + "-" * max_len + "-+") + print() + + for ref in file_ref: + if wtype == "File": + print(f".. _{ref[1]}:\n") + else: + base = os.path.basename(ref[0]) + print(f"Defined on file :ref:`{base} <{ref[1]}>`\n") + + if wtype == "File": + print(f"{names[0]}\n{"-" * len(names[0])}\n") + + desc = v.get("description") + if not desc and wtype != "File": + print(f"DESCRIPTION MISSING for {names[0]}\n") + + if desc: + if output_in_txt: + self.print_desc_txt(desc) + else: + self.print_desc_rst(desc) + + symbols = v.get("symbols") + if symbols: + print("Has the following ABI:\n") + + for w, label in symbols: + # Escape special chars from content + content = self.re_escape.sub(r"\\\1", w) + + print(f"- :ref:`{content} <{label}>`\n") + + users = v.get("users") + if users and users.strip(" \t\n"): + print(f"Users:\n\t{users.strip("\n").replace('\n', '\n\t')}\n") + + def check_issues(self): + """Warn about duplicated ABI entries""" + + for what, v in self.what_symbols.items(): + files = v.get("file") + if not files: + # Should never happen if the parser works properly + self.log.warning("%s doesn't have a file associated", what) + continue + + if len(files) == 1: + continue + + f = [] + for fname, lines in sorted(files.items()): + if not lines: + f.append(f"{fname}") + elif len(lines) == 1: + f.append(f"{fname}:{lines[0]}") + else: + f.append(f"{fname} lines {", ".join(str(x) for x in lines)}") + + self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) diff --git a/scripts/lib/abi/helpers.py b/scripts/lib/abi/helpers.py new file mode 100644 index 0000000000000..84a253ed50587 --- /dev/null +++ b/scripts/lib/abi/helpers.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# Copyright(c) 2025: Mauro Carvalho Chehab . +# pylint: disable=R0903 +# SPDX-License-Identifier: GPL-2.0 + +""" +Helper classes for ABI parser +""" + +ABI_DIR = "Documentation/ABI/" + + +class AbiDebug: + """Debug levels""" + + WHAT_PARSING = 1 + WHAT_OPEN = 2 + DUMP_ABI_STRUCTS = 4 + + +DEBUG_HELP = """ +Print debug information according with the level(s), +which is given by the following bitmask: + +1 - enable debug parsing logic +2 - enable debug messages on file open +4 - enable debug for ABI parse data +""" -- GitLab From 6b48bea16848dd7c771411db3dcc01b3bc4dd4c2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:00 +0100 Subject: [PATCH 181/934] scripts/get_abi.py: add support for symbol search Add support for searching symbols from Documentation/ABI using regular expressions to match the symbols' names. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/21b2c48657dde112d5417dcd7e0aa7cd383b9a0a.1739182025.git.mchehab+huawei@kernel.org --- scripts/get_abi.py | 24 ++++++++++++++++ scripts/lib/abi/abi_parser.py | 52 +++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/scripts/get_abi.py b/scripts/get_abi.py index bb17c54feeffe..30439f21fdd00 100755 --- a/scripts/get_abi.py +++ b/scripts/get_abi.py @@ -85,6 +85,29 @@ class AbiValidate: parser.check_issues() +class AbiSearch: + """Initialize an argparse subparser for ABI search""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("search", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Search ABI using a regular expression") + + parser.add_argument("expression", + help="Case-insensitive search pattern for the ABI symbol") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.search_symbols(args.expression) + + def main(): """Main program""" @@ -97,6 +120,7 @@ def main(): AbiRest(subparsers) AbiValidate(subparsers) + AbiSearch(subparsers) args = parser.parse_args() diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index b3fa70eee4128..bea7f1a761658 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -510,3 +510,55 @@ class AbiParser: f.append(f"{fname} lines {", ".join(str(x) for x in lines)}") self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) + + def search_symbols(self, expr): + """ Searches for ABI symbols """ + + regex = re.compile(expr, re.I) + + found_keys = 0 + for t in sorted(self.data.items(), key=lambda x: [0]): + v = t[1] + + wtype = v.get("type", "") + if wtype == "File": + continue + + for what in v.get("what", [""]): + if regex.search(what): + found_keys += 1 + + kernelversion = v.get("kernelversion", "").strip(" \t\n") + date = v.get("date", "").strip(" \t\n") + contact = v.get("contact", "").strip(" \t\n") + users = v.get("users", "").strip(" \t\n") + desc = v.get("description", "").strip(" \t\n") + + files = [] + for f in v.get("file", ()): + files.append(f[0]) + + what = str(found_keys) + ". " + what + title_tag = "-" * len(what) + + print(f"\n{what}\n{title_tag}\n") + + if kernelversion: + print(f"Kernel version:\t\t{kernelversion}") + + if date: + print(f"Date:\t\t\t{date}") + + if contact: + print(f"Contact:\t\t{contact}") + + if users: + print(f"Users:\t\t\t{users}") + + print(f"Defined on file{'s'[:len(files) ^ 1]}:\t{", ".join(files)}") + + if desc: + print(f"\n{desc.strip("\n")}\n") + + if not found_keys: + print(f"Regular expression /{expr}/ not found.") -- GitLab From 9d7ec8867960d731143280eb86d21a994fffb3de Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:01 +0100 Subject: [PATCH 182/934] docs: use get_abi.py for ABI generation Use the new script instead of the old one when generating ABI docs. For now, execute it via exec. Future changes will better integrate it by using the class defined there directly. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/e7fcb121c0612c94f6f54f0d742cd3a26a46cd7d.1739182025.git.mchehab+huawei@kernel.org --- Documentation/admin-guide/abi-obsolete.rst | 1 - Documentation/admin-guide/abi-removed.rst | 1 - Documentation/admin-guide/abi-stable.rst | 1 - Documentation/admin-guide/abi-testing.rst | 1 - Documentation/sphinx/kernel_abi.py | 10 +++------- 5 files changed, 3 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/abi-obsolete.rst b/Documentation/admin-guide/abi-obsolete.rst index b655615917f1e..6d4d9ab7b8c3d 100644 --- a/Documentation/admin-guide/abi-obsolete.rst +++ b/Documentation/admin-guide/abi-obsolete.rst @@ -10,4 +10,3 @@ The description of the interface will document the reason why it is obsolete and when it can be expected to be removed. .. kernel-abi:: ABI/obsolete - :rst: diff --git a/Documentation/admin-guide/abi-removed.rst b/Documentation/admin-guide/abi-removed.rst index ba941c1af1784..9fc78af6f0771 100644 --- a/Documentation/admin-guide/abi-removed.rst +++ b/Documentation/admin-guide/abi-removed.rst @@ -4,4 +4,3 @@ ABI removed symbols =================== .. kernel-abi:: ABI/removed - :rst: diff --git a/Documentation/admin-guide/abi-stable.rst b/Documentation/admin-guide/abi-stable.rst index 5d738f345333e..c47c2a295865b 100644 --- a/Documentation/admin-guide/abi-stable.rst +++ b/Documentation/admin-guide/abi-stable.rst @@ -13,4 +13,3 @@ Most interfaces (like syscalls) are expected to never change and always be available. .. kernel-abi:: ABI/stable - :rst: diff --git a/Documentation/admin-guide/abi-testing.rst b/Documentation/admin-guide/abi-testing.rst index a867e6578bf7d..40b31985e5874 100644 --- a/Documentation/admin-guide/abi-testing.rst +++ b/Documentation/admin-guide/abi-testing.rst @@ -19,4 +19,3 @@ name to the description of these interfaces, so that the kernel developers can easily notify them if any changes occur. .. kernel-abi:: ABI/testing - :rst: diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 38653f5706c0f..f314b888d3dec 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -14,7 +14,7 @@ u""" :license: GPL Version 2, June 1991 see Linux/COPYING for details. The ``kernel-abi`` (:py:class:`KernelCmd`) directive calls the - scripts/get_abi.pl script to parse the Kernel ABI files. + scripts/get_abi.py script to parse the Kernel ABI files. Overview of directive's argument and options. @@ -67,7 +67,6 @@ class KernelCmd(Directive): option_spec = { "debug" : directives.flag, - "rst" : directives.unchanged } def run(self): @@ -78,15 +77,12 @@ class KernelCmd(Directive): srctree = os.path.abspath(os.environ["srctree"]) args = [ - os.path.join(srctree, 'scripts/get_abi.pl'), + os.path.join(srctree, 'scripts/get_abi.py'), + '-D', os.path.join(srctree, 'Documentation', self.arguments[0]), 'rest', '--enable-lineno', - '--dir', os.path.join(srctree, 'Documentation', self.arguments[0]), ] - if 'rst' in self.options: - args.append('--rst-source') - lines = subprocess.check_output(args, cwd=os.path.dirname(doc.current_source)).decode('utf-8') nodeList = self.nestedParse(lines, self.arguments[0]) return nodeList -- GitLab From c67c3fbdd917884e38a366c38717c9f769075c15 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:02 +0100 Subject: [PATCH 183/934] scripts/lib/abi/abi_parser.py: optimize parse_abi() function Instead of using glob, use a recursive function to parse all files. Such change reduces the total excecution time by 15% with my SSD disks. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/190dd358897017ed82c56f1e263192215ffbae43.1739182025.git.mchehab+huawei@kernel.org --- scripts/lib/abi/abi_parser.py | 49 ++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index bea7f1a761658..6052a8aec443e 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -12,7 +12,6 @@ import logging import os import re -from glob import glob from pprint import pformat from random import randrange, seed @@ -46,7 +45,11 @@ class AbiParser: self.file_refs = {} self.what_refs = {} + # Ignore files that contain such suffixes + self.ignore_suffixes = (".rej", ".org", ".orig", ".bak", "~") + # Regular expressions used on parser + self.re_abi_dir = re.compile(r"(.*)" + ABI_DIR) self.re_tag = re.compile(r"(\S+)(:\s*)(.*)", re.I) self.re_valid = re.compile(self.TAGS) self.re_start_spc = re.compile(r"(\s*)(\S.*)") @@ -322,26 +325,42 @@ class AbiParser: for w in fdata.what: self.add_symbol(what=w, fname=fname, xref=fdata.key) - def parse_abi(self): - """Parse documentation ABI""" + def _parse_abi(self, root=None): + """Internal function to parse documentation ABI recursively""" - ignore_suffixes = ("rej", "org", "orig", "bak", "~") - re_abi = re.compile(r".*" + ABI_DIR) + if not root: + root = self.directory - for fname in glob(os.path.join(self.directory, "**"), recursive=True): - if os.path.isdir(fname): - continue + with os.scandir(root) as obj: + for entry in obj: + name = os.path.join(root, entry.name) - basename = os.path.basename(fname) + if entry.is_dir(): + self.parse_abi(name) + continue - if basename == "README": - continue - if basename.startswith(".") or basename.endswith(ignore_suffixes): - continue + if not entry.is_file(): + continue + + basename = os.path.basename(name) - path = re_abi.sub("", os.path.dirname(fname)) + if basename == "README": + continue + + if basename.startswith("."): + continue + + if basename.endswith(self.ignore_suffixes): + continue + + path = self.re_abi_dir.sub("", os.path.dirname(name)) + + self.parse_file(name, path, basename) + + def parse_abi(self, root=None): + """Parse documentation ABI""" - self.parse_file(fname, path, basename) + self._parse_abi(root) if self.debug & AbiDebug.DUMP_ABI_STRUCTS: self.log.debug(pformat(self.data)) -- GitLab From 9bec7870c64c00983773cfddab8d6a037f7767f3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:03 +0100 Subject: [PATCH 184/934] scripts/lib/abi/abi_parser.py: use an interactor for ReST output Instead of printing all results line per line, use an interactor to return each variable as a separate message. This won't change much when using it via command line, but it will help Sphinx integration by providing an interactor that could be used there to handle ABI symbol by symbol. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/e3c94b8cdfd5e955aa19a703921f364a89089634.1739182025.git.mchehab+huawei@kernel.org --- scripts/get_abi.py | 3 ++- scripts/lib/abi/abi_parser.py | 48 +++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/scripts/get_abi.py b/scripts/get_abi.py index 30439f21fdd00..93b973bc07ed4 100755 --- a/scripts/get_abi.py +++ b/scripts/get_abi.py @@ -62,8 +62,9 @@ class AbiRest: parser = AbiParser(args.dir, debug=args.debug) parser.parse_abi() parser.check_issues() - parser.print_data(args.enable_lineno, args.raw, not args.no_file) + for msg in parser.doc(args.enable_lineno, args.raw, not args.no_file): + print(msg) class AbiValidate: """Initialize an argparse subparser for ABI validation""" diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 6052a8aec443e..960e27161c263 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -336,7 +336,7 @@ class AbiParser: name = os.path.join(root, entry.name) if entry.is_dir(): - self.parse_abi(name) + self._parse_abi(name) continue if not entry.is_file(): @@ -365,14 +365,14 @@ class AbiParser: if self.debug & AbiDebug.DUMP_ABI_STRUCTS: self.log.debug(pformat(self.data)) - def print_desc_txt(self, desc): + def desc_txt(self, desc): """Print description as found inside ABI files""" desc = desc.strip(" \t\n") - print(desc + "\n") + return desc + "\n\n" - def print_desc_rst(self, desc): + def desc_rst(self, desc): """Enrich ReST output by creating cross-references""" # Remove title markups from the description @@ -425,9 +425,9 @@ class AbiParser: new_desc += d + "\n" - print(new_desc + "\n") + return new_desc + "\n\n" - def print_data(self, enable_lineno, output_in_txt, show_file=False): + def doc(self, enable_lineno, output_in_txt, show_file=False): """Print ABI at stdout""" part = None @@ -442,9 +442,11 @@ class AbiParser: if not show_file and wtype == "File": continue + msg = "" + if enable_lineno: ln = v.get("line_no", 1) - print(f".. LINENO {file_ref[0][0]}#{ln}\n") + msg += f".. LINENO {file_ref[0][0]}#{ln}\n\n" if wtype != "File": cur_part = names[0] @@ -456,9 +458,9 @@ class AbiParser: if cur_part and cur_part != part: part = cur_part - print(f"{part}\n{"-" * len(part)}\n") + msg += f"{part}\n{"-" * len(part)}\n\n" - print(f".. _{key}:\n") + msg += f".. _{key}:\n\n" max_len = 0 for i in range(0, len(names)): # pylint: disable=C0200 @@ -466,45 +468,47 @@ class AbiParser: max_len = max(max_len, len(names[i])) - print("+-" + "-" * max_len + "-+") + msg += "+-" + "-" * max_len + "-+\n" for name in names: - print(f"| {name}" + " " * (max_len - len(name)) + " |") - print("+-" + "-" * max_len + "-+") - print() + msg += f"| {name}" + " " * (max_len - len(name)) + " |\n" + msg += "+-" + "-" * max_len + "-+\n" + msg += "\n" for ref in file_ref: if wtype == "File": - print(f".. _{ref[1]}:\n") + msg += f".. _{ref[1]}:\n\n" else: base = os.path.basename(ref[0]) - print(f"Defined on file :ref:`{base} <{ref[1]}>`\n") + msg += f"Defined on file :ref:`{base} <{ref[1]}>`\n\n" if wtype == "File": - print(f"{names[0]}\n{"-" * len(names[0])}\n") + msg += f"{names[0]}\n{"-" * len(names[0])}\n\n" desc = v.get("description") if not desc and wtype != "File": - print(f"DESCRIPTION MISSING for {names[0]}\n") + msg += f"DESCRIPTION MISSING for {names[0]}\n\n" if desc: if output_in_txt: - self.print_desc_txt(desc) + msg += self.desc_txt(desc) else: - self.print_desc_rst(desc) + msg += self.desc_rst(desc) symbols = v.get("symbols") if symbols: - print("Has the following ABI:\n") + msg += "Has the following ABI:\n\n" for w, label in symbols: # Escape special chars from content content = self.re_escape.sub(r"\\\1", w) - print(f"- :ref:`{content} <{label}>`\n") + msg += f"- :ref:`{content} <{label}>`\n\n" users = v.get("users") if users and users.strip(" \t\n"): - print(f"Users:\n\t{users.strip("\n").replace('\n', '\n\t')}\n") + msg += f"Users:\n\t{users.strip("\n").replace('\n', '\n\t')}\n\n" + + yield msg def check_issues(self): """Warn about duplicated ABI entries""" -- GitLab From ee34f8300c8940758dc69f80107d9f5873c08f17 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:04 +0100 Subject: [PATCH 185/934] docs: sphinx/kernel_abi: use AbiParser directly Instead of running get_abi.py script, import AbiParser class and handle messages directly there using an interactor. This shold save some memory, as there's no need to exec python inside the Sphinx python extension. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/8dbc244dcda97112c1b694e2512a5d600e62873b.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/kernel_abi.py | 29 ++++++++++++++++------------- scripts/lib/abi/abi_parser.py | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index f314b888d3dec..f7b22abebcf43 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -34,7 +34,6 @@ u""" import os import re -import subprocess import sys from docutils import nodes @@ -43,6 +42,11 @@ from docutils.parsers.rst import directives, Directive from sphinx.util.docutils import switch_source_input from sphinx.util import logging +srctree = os.path.abspath(os.environ["srctree"]) +sys.path.insert(0, os.path.join(srctree, "scripts/lib/abi")) + +from abi_parser import AbiParser + __version__ = "1.0" @@ -66,7 +70,7 @@ class KernelCmd(Directive): logger = logging.getLogger('kernel_abi') option_spec = { - "debug" : directives.flag, + "debug": directives.flag, } def run(self): @@ -74,20 +78,19 @@ class KernelCmd(Directive): if not doc.settings.file_insertion_enabled: raise self.warning("docutils: file insertion disabled") - srctree = os.path.abspath(os.environ["srctree"]) + path = os.path.join(srctree, "Documentation", self.arguments[0]) + parser = AbiParser(path, logger=self.logger) + parser.parse_abi() + parser.check_issues() - args = [ - os.path.join(srctree, 'scripts/get_abi.py'), - '-D', os.path.join(srctree, 'Documentation', self.arguments[0]), - 'rest', - '--enable-lineno', - ] + msg = "" + for m in parser.doc(enable_lineno=True, show_file=True): + msg += m - lines = subprocess.check_output(args, cwd=os.path.dirname(doc.current_source)).decode('utf-8') - nodeList = self.nestedParse(lines, self.arguments[0]) - return nodeList + node = self.nested_parse(msg, self.arguments[0]) + return node - def nestedParse(self, lines, fname): + def nested_parse(self, lines, fname): env = self.state.document.settings.env content = ViewList() node = nodes.section() diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 960e27161c263..57c125fd40a5c 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -427,7 +427,7 @@ class AbiParser: return new_desc + "\n\n" - def doc(self, enable_lineno, output_in_txt, show_file=False): + def doc(self, enable_lineno, output_in_txt=False, show_file=False): """Print ABI at stdout""" part = None -- GitLab From aea5e52dce74f679b91c66caad91d587d5504f6c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:05 +0100 Subject: [PATCH 186/934] docs: sphinx/kernel_abi: reduce buffer usage for ABI messages Instead of producing a big message with all ABI contents and then parse as a whole, simplify the code by handling each ABI symbol in separate. As an additional benefit, there's no need to place file/line nubers inlined at the data and use a regex to convert them. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/15be22955e3c6df49d7256c8fd24f62b397ad0ff.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/kernel_abi.py | 82 +++++++++++++++--------------- scripts/get_abi.py | 7 ++- scripts/lib/abi/abi_parser.py | 10 ++-- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index f7b22abebcf43..742ebd35454f6 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -68,6 +68,7 @@ class KernelCmd(Directive): has_content = False final_argument_whitespace = True logger = logging.getLogger('kernel_abi') + parser = None option_spec = { "debug": directives.flag, @@ -79,59 +80,60 @@ class KernelCmd(Directive): raise self.warning("docutils: file insertion disabled") path = os.path.join(srctree, "Documentation", self.arguments[0]) - parser = AbiParser(path, logger=self.logger) - parser.parse_abi() - parser.check_issues() + self.parser = AbiParser(path, logger=self.logger) + self.parser.parse_abi() + self.parser.check_issues() - msg = "" - for m in parser.doc(enable_lineno=True, show_file=True): - msg += m - - node = self.nested_parse(msg, self.arguments[0]) + node = self.nested_parse(None, self.arguments[0]) return node - def nested_parse(self, lines, fname): + def nested_parse(self, data, fname): env = self.state.document.settings.env content = ViewList() node = nodes.section() - if "debug" in self.options: - code_block = "\n\n.. code-block:: rst\n :linenos:\n" - for line in lines.split("\n"): - code_block += "\n " + line - lines = code_block + "\n\n" - - line_regex = re.compile(r"^\.\. LINENO (\S+)\#([0-9]+)$") - ln = 0 - n = 0 - f = fname - - for line in lines.split("\n"): - n = n + 1 - match = line_regex.search(line) - if match: - new_f = match.group(1) - - # Sphinx parser is lazy: it stops parsing contents in the - # middle, if it is too big. So, handle it per input file - if new_f != f and content: - self.do_parse(content, node) - content = ViewList() + if data is not None: + # Handles the .rst file + for line in data.split("\n"): + content.append(line, fname, 0) + + self.do_parse(content, node) + else: + # Handles the ABI parser content, symbol by symbol + + old_f = fname + n = 0 + for msg, f, ln in self.parser.doc(): + msg_list = msg.split("\n") + if "debug" in self.options: + lines = [ + "", "", ".. code-block:: rst", + " :linenos:", "" + ] + for m in msg_list: + lines.append(" " + m) + else: + lines = msg_list + + for line in lines: + # sphinx counts lines from 0 + content.append(line, f, ln - 1) + n += 1 + + if f != old_f: # Add the file to Sphinx build dependencies env.note_dependency(os.path.abspath(f)) - f = new_f + old_f = f - # sphinx counts lines from 0 - ln = int(match.group(2)) - 1 - else: - content.append(line, f, ln) - - self.logger.info("%s: parsed %i lines" % (fname, n)) + # Sphinx doesn't like to parse big messages. So, let's + # add content symbol by symbol + if content: + self.do_parse(content, node) + content = ViewList() - if content: - self.do_parse(content, node) + self.logger.info("%s: parsed %i lines" % (fname, n)) return node.children diff --git a/scripts/get_abi.py b/scripts/get_abi.py index 93b973bc07ed4..19f78d6aa4079 100755 --- a/scripts/get_abi.py +++ b/scripts/get_abi.py @@ -63,8 +63,11 @@ class AbiRest: parser.parse_abi() parser.check_issues() - for msg in parser.doc(args.enable_lineno, args.raw, not args.no_file): - print(msg) + for t in parser.doc(args.raw, not args.no_file): + if args.enable_lineno: + print (f".. LINENO {t[1]}#{t[2]}\n\n") + + print(t[0]) class AbiValidate: """Initialize an argparse subparser for ABI validation""" diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 57c125fd40a5c..1db6c54fc65ad 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -427,7 +427,7 @@ class AbiParser: return new_desc + "\n\n" - def doc(self, enable_lineno, output_in_txt=False, show_file=False): + def doc(self, output_in_txt=False, show_file=True): """Print ABI at stdout""" part = None @@ -444,10 +444,6 @@ class AbiParser: msg = "" - if enable_lineno: - ln = v.get("line_no", 1) - msg += f".. LINENO {file_ref[0][0]}#{ln}\n\n" - if wtype != "File": cur_part = names[0] if cur_part.find("/") >= 0: @@ -508,7 +504,9 @@ class AbiParser: if users and users.strip(" \t\n"): msg += f"Users:\n\t{users.strip("\n").replace('\n', '\n\t')}\n\n" - yield msg + ln = v.get("line_no", 1) + + yield (msg, file_ref[0][0], ln) def check_issues(self): """Warn about duplicated ABI entries""" -- GitLab From cc93e4829a14339575383fb3cd7d3e858faabc2a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:06 +0100 Subject: [PATCH 187/934] docs: sphinx/kernel_abi: properly split lines Sphinx doesn't like to have lines split with str.split("\n"). Instead, it uses its own splitter, with handles line breaks the way Spinx expects. Not using it cause issues at the output files. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/d4ad5b977799616544376210364d5cec686119ef.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/kernel_abi.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 742ebd35454f6..0a40571832080 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -36,7 +36,7 @@ import os import re import sys -from docutils import nodes +from docutils import nodes, statemachine from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive from sphinx.util.docutils import switch_source_input @@ -105,7 +105,8 @@ class KernelCmd(Directive): old_f = fname n = 0 for msg, f, ln in self.parser.doc(): - msg_list = msg.split("\n") + msg_list = statemachine.string2lines(msg, tab_width, + convert_whitespace=True) if "debug" in self.options: lines = [ "", "", ".. code-block:: rst", -- GitLab From 2a21d80dfb4135b4766d8ff3231a3ea1c19bcc83 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:07 +0100 Subject: [PATCH 188/934] scripts/get_abi.pl: Add filtering capabilities to rest output This way, Sphinx ABI extension can parse symbols only once, while keep displaying results in separate files. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/41e108e816e46434aa596e5c0d25d227cb9f0fe5.1739182025.git.mchehab+huawei@kernel.org --- scripts/lib/abi/abi_parser.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 1db6c54fc65ad..b20d5c9d920e0 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -160,6 +160,7 @@ class AbiParser: self.data[fdata.key] = { "what": [content], "file": [fdata.file_ref], + "path": fdata.ftype, "line_no": fdata.ln, } @@ -182,8 +183,6 @@ class AbiParser: if new_what: fdata.label = "" - self.data[fdata.key]["type"] = fdata.ftype - if "description" in self.data[fdata.key]: self.data[fdata.key]["description"] += "\n\n" @@ -299,6 +298,7 @@ class AbiParser: fdata.nametag = {} fdata.nametag["what"] = [f"File {path}/{basename}"] fdata.nametag["type"] = "File" + fdata.nametag["path"] = fdata.ftype fdata.nametag["file"] = [fdata.file_ref] fdata.nametag["line_no"] = 1 fdata.nametag["description"] = "" @@ -427,7 +427,8 @@ class AbiParser: return new_desc + "\n\n" - def doc(self, output_in_txt=False, show_file=True): + def doc(self, output_in_txt=False, show_symbols=True, show_file=True, + filter_path=None): """Print ABI at stdout""" part = None @@ -435,12 +436,20 @@ class AbiParser: key=lambda x: (x[1].get("type", ""), x[1].get("what"))): - wtype = v.get("type", "Var") + wtype = v.get("type", "Symbol") file_ref = v.get("file") names = v.get("what", [""]) - if not show_file and wtype == "File": - continue + if wtype == "File": + if not show_file: + continue + else: + if not show_symbols: + continue + + if filter_path: + if v.get("path") != filter_path: + continue msg = "" -- GitLab From 98a4324a8b7bbe433483c90524026be0ccc9ffa8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:08 +0100 Subject: [PATCH 189/934] scripts/get_abi.pl: add support to parse ABI README file The Documentation/ABI/README file is currently outside the documentation tree. Yet, it may still provide some useful information. Add it to the documentation parsing. As a plus, this avoids a warning when detecting missing cross-references. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/f1285dedfe4d0eb0f0af34f6a68bee6fde36dd7d.1739182025.git.mchehab+huawei@kernel.org --- scripts/lib/abi/abi_parser.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index b20d5c9d920e0..6fac461d794ce 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -263,6 +263,16 @@ class AbiParser: if content: self.warn(fdata, "Unexpected content", line) + def parse_readme(self, nametag, fname): + """Parse ABI README file""" + + with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: + nametag["description"] = "```\n" + for line in fp: + nametag["description"] += " " + line + + nametag["description"] += "```\n" + def parse_file(self, fname, path, basename): """Parse a single file""" @@ -309,6 +319,10 @@ class AbiParser: if self.debug & AbiDebug.WHAT_OPEN: self.log.debug("Opening file %s", fname) + if basename == "README": + self.parse_readme(fdata.nametag, fname) + return + with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: for line in fp: fdata.ln += 1 @@ -344,9 +358,6 @@ class AbiParser: basename = os.path.basename(name) - if basename == "README": - continue - if basename.startswith("."): continue @@ -448,8 +459,12 @@ class AbiParser: continue if filter_path: - if v.get("path") != filter_path: - continue + if filter_path == "README": + if not names[0].endswith("README"): + continue + else: + if v.get("path") != filter_path: + continue msg = "" -- GitLab From 5d7871d77f6d62406b3d459a58810c1ddb8904c2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:09 +0100 Subject: [PATCH 190/934] docs: sphinx/kernel_abi: parse ABI files only once Right now, the logic parses ABI files on 4 steps, one for each directory. While this is fine in principle, by doing that, not all symbol cross-references will be created. Change the logic to do the parsing only once in order to get a global dictionary to be used when creating ABI cross-references. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/5205c53838b6ea25f4cdd4cc1e3d17c0141e75a6.1739182025.git.mchehab+huawei@kernel.org --- Documentation/admin-guide/abi-obsolete.rst | 2 +- Documentation/admin-guide/abi-removed.rst | 2 +- Documentation/admin-guide/abi-stable.rst | 2 +- Documentation/admin-guide/abi-testing.rst | 2 +- Documentation/sphinx/kernel_abi.py | 115 ++++++++++++--------- scripts/lib/abi/abi_parser.py | 22 ++-- 6 files changed, 81 insertions(+), 64 deletions(-) diff --git a/Documentation/admin-guide/abi-obsolete.rst b/Documentation/admin-guide/abi-obsolete.rst index 6d4d9ab7b8c3d..bdef91d2cea4a 100644 --- a/Documentation/admin-guide/abi-obsolete.rst +++ b/Documentation/admin-guide/abi-obsolete.rst @@ -9,4 +9,4 @@ marked to be removed at some later point in time. The description of the interface will document the reason why it is obsolete and when it can be expected to be removed. -.. kernel-abi:: ABI/obsolete +.. kernel-abi:: obsolete diff --git a/Documentation/admin-guide/abi-removed.rst b/Documentation/admin-guide/abi-removed.rst index 9fc78af6f0771..bea0608b84428 100644 --- a/Documentation/admin-guide/abi-removed.rst +++ b/Documentation/admin-guide/abi-removed.rst @@ -3,4 +3,4 @@ ABI removed symbols =================== -.. kernel-abi:: ABI/removed +.. kernel-abi:: removed diff --git a/Documentation/admin-guide/abi-stable.rst b/Documentation/admin-guide/abi-stable.rst index c47c2a295865b..33637c0d4fd50 100644 --- a/Documentation/admin-guide/abi-stable.rst +++ b/Documentation/admin-guide/abi-stable.rst @@ -12,4 +12,4 @@ for at least 2 years. Most interfaces (like syscalls) are expected to never change and always be available. -.. kernel-abi:: ABI/stable +.. kernel-abi:: stable diff --git a/Documentation/admin-guide/abi-testing.rst b/Documentation/admin-guide/abi-testing.rst index 40b31985e5874..55054985a8ffb 100644 --- a/Documentation/admin-guide/abi-testing.rst +++ b/Documentation/admin-guide/abi-testing.rst @@ -18,4 +18,4 @@ Programs that use these interfaces are strongly encouraged to add their name to the description of these interfaces, so that the kernel developers can easily notify them if any changes occur. -.. kernel-abi:: ABI/testing +.. kernel-abi:: testing diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 0a40571832080..964f586de171d 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -49,6 +49,13 @@ from abi_parser import AbiParser __version__ = "1.0" +logger = logging.getLogger('kernel_abi') +path = os.path.join(srctree, "Documentation/ABI") + +# Parse ABI symbols only once +kernel_abi = AbiParser(path, logger=logger) +kernel_abi.parse_abi() +kernel_abi.check_issues() def setup(app): @@ -64,14 +71,15 @@ class KernelCmd(Directive): u"""KernelABI (``kernel-abi``) directive""" required_arguments = 1 - optional_arguments = 2 + optional_arguments = 3 has_content = False final_argument_whitespace = True - logger = logging.getLogger('kernel_abi') parser = None option_spec = { "debug": directives.flag, + "no-symbols": directives.flag, + "no-files": directives.flag, } def run(self): @@ -79,62 +87,67 @@ class KernelCmd(Directive): if not doc.settings.file_insertion_enabled: raise self.warning("docutils: file insertion disabled") - path = os.path.join(srctree, "Documentation", self.arguments[0]) - self.parser = AbiParser(path, logger=self.logger) - self.parser.parse_abi() - self.parser.check_issues() - - node = self.nested_parse(None, self.arguments[0]) - return node - - def nested_parse(self, data, fname): env = self.state.document.settings.env content = ViewList() node = nodes.section() - if data is not None: - # Handles the .rst file - for line in data.split("\n"): - content.append(line, fname, 0) + abi_type = self.arguments[0] - self.do_parse(content, node) + if "no-symbols" in self.options: + show_symbols = False + else: + show_symbols = True + if "no-files" in self.options: + show_file = False + else: + show_file = True + + tab_width = self.options.get('tab-width', + self.state.document.settings.tab_width) + + old_f = None + n = 0 + n_sym = 0 + for msg, f, ln in kernel_abi.doc(show_file=show_file, + show_symbols=show_symbols, + filter_path=abi_type): + n_sym += 1 + msg_list = statemachine.string2lines(msg, tab_width, + convert_whitespace=True) + if "debug" in self.options: + lines = [ + "", "", ".. code-block:: rst", + " :linenos:", "" + ] + for m in msg_list: + lines.append(" " + m) + else: + lines = msg_list + + for line in lines: + # sphinx counts lines from 0 + content.append(line, f, ln - 1) + n += 1 + + if f != old_f: + # Add the file to Sphinx build dependencies + env.note_dependency(os.path.abspath(f)) + + old_f = f + + # Sphinx doesn't like to parse big messages. So, let's + # add content symbol by symbol + if content: + self.do_parse(content, node) + content = ViewList() + + if show_symbols and not show_file: + logger.verbose("%s ABI: %i symbols (%i ReST lines)" % (abi_type, n_sym, n)) + elif not show_symbols and show_file: + logger.verbose("%s ABI: %i files (%i ReST lines)" % (abi_type, n_sym, n)) else: - # Handles the ABI parser content, symbol by symbol - - old_f = fname - n = 0 - for msg, f, ln in self.parser.doc(): - msg_list = statemachine.string2lines(msg, tab_width, - convert_whitespace=True) - if "debug" in self.options: - lines = [ - "", "", ".. code-block:: rst", - " :linenos:", "" - ] - for m in msg_list: - lines.append(" " + m) - else: - lines = msg_list - - for line in lines: - # sphinx counts lines from 0 - content.append(line, f, ln - 1) - n += 1 - - if f != old_f: - # Add the file to Sphinx build dependencies - env.note_dependency(os.path.abspath(f)) - - old_f = f - - # Sphinx doesn't like to parse big messages. So, let's - # add content symbol by symbol - if content: - self.do_parse(content, node) - content = ViewList() - - self.logger.info("%s: parsed %i lines" % (fname, n)) + logger.verbose("%s ABI: %i data (%i ReST lines)" % (abi_type, n_sym, n)) return node.children diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 6fac461d794ce..87d1b9e14bb3e 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -266,12 +266,20 @@ class AbiParser: def parse_readme(self, nametag, fname): """Parse ABI README file""" + nametag["what"] = ["ABI file contents"] + nametag["path"] = "README" with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: - nametag["description"] = "```\n" for line in fp: - nametag["description"] += " " + line + match = self.re_tag.match(line) + if match: + new = match.group(1).lower() + + match = self.re_valid.search(new) + if match: + nametag["description"] += "\n:" + line + continue - nametag["description"] += "```\n" + nametag["description"] += line def parse_file(self, fname, path, basename): """Parse a single file""" @@ -459,12 +467,8 @@ class AbiParser: continue if filter_path: - if filter_path == "README": - if not names[0].endswith("README"): - continue - else: - if v.get("path") != filter_path: - continue + if v.get("path") != filter_path: + continue msg = "" -- GitLab From 4bb2dbd7576d38a5789692f2b87003fa99138090 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:10 +0100 Subject: [PATCH 191/934] docs: admin-guide/abi: split files from symbols Now that get_abi has gained support for filtering its output, split ABI symbols from files at the html output. That makes pages smaller and easier to navigate. As an additional bonus, as it will paralelize files handling, it gives an additional performance improvement. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/30e3cf2a8aeef23ca889de60a90f7de141e0dc0e.1739182025.git.mchehab+huawei@kernel.org --- Documentation/admin-guide/abi-obsolete-files.rst | 7 +++++++ Documentation/admin-guide/abi-obsolete.rst | 1 + Documentation/admin-guide/abi-readme-file.rst | 6 ++++++ Documentation/admin-guide/abi-removed-files.rst | 7 +++++++ Documentation/admin-guide/abi-removed.rst | 1 + Documentation/admin-guide/abi-stable-files.rst | 7 +++++++ Documentation/admin-guide/abi-stable.rst | 1 + Documentation/admin-guide/abi-testing-files.rst | 7 +++++++ Documentation/admin-guide/abi-testing.rst | 1 + Documentation/admin-guide/abi.rst | 15 +++++++++++++++ 10 files changed, 53 insertions(+) create mode 100644 Documentation/admin-guide/abi-obsolete-files.rst create mode 100644 Documentation/admin-guide/abi-readme-file.rst create mode 100644 Documentation/admin-guide/abi-removed-files.rst create mode 100644 Documentation/admin-guide/abi-stable-files.rst create mode 100644 Documentation/admin-guide/abi-testing-files.rst diff --git a/Documentation/admin-guide/abi-obsolete-files.rst b/Documentation/admin-guide/abi-obsolete-files.rst new file mode 100644 index 0000000000000..3061a916b4b56 --- /dev/null +++ b/Documentation/admin-guide/abi-obsolete-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Obsolete ABI Files +================== + +.. kernel-abi:: obsolete + :no-symbols: diff --git a/Documentation/admin-guide/abi-obsolete.rst b/Documentation/admin-guide/abi-obsolete.rst index bdef91d2cea4a..640f3903e8470 100644 --- a/Documentation/admin-guide/abi-obsolete.rst +++ b/Documentation/admin-guide/abi-obsolete.rst @@ -10,3 +10,4 @@ The description of the interface will document the reason why it is obsolete and when it can be expected to be removed. .. kernel-abi:: obsolete + :no-files: diff --git a/Documentation/admin-guide/abi-readme-file.rst b/Documentation/admin-guide/abi-readme-file.rst new file mode 100644 index 0000000000000..6172e4ccbda2a --- /dev/null +++ b/Documentation/admin-guide/abi-readme-file.rst @@ -0,0 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 + +ABI README +========== + +.. kernel-abi:: README diff --git a/Documentation/admin-guide/abi-removed-files.rst b/Documentation/admin-guide/abi-removed-files.rst new file mode 100644 index 0000000000000..f1bdfadd2ec45 --- /dev/null +++ b/Documentation/admin-guide/abi-removed-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Removed ABI Files +================= + +.. kernel-abi:: removed + :no-symbols: diff --git a/Documentation/admin-guide/abi-removed.rst b/Documentation/admin-guide/abi-removed.rst index bea0608b84428..88832d3eacd6e 100644 --- a/Documentation/admin-guide/abi-removed.rst +++ b/Documentation/admin-guide/abi-removed.rst @@ -4,3 +4,4 @@ ABI removed symbols =================== .. kernel-abi:: removed + :no-files: diff --git a/Documentation/admin-guide/abi-stable-files.rst b/Documentation/admin-guide/abi-stable-files.rst new file mode 100644 index 0000000000000..f867738fc178a --- /dev/null +++ b/Documentation/admin-guide/abi-stable-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Stable ABI Files +================ + +.. kernel-abi:: stable + :no-symbols: diff --git a/Documentation/admin-guide/abi-stable.rst b/Documentation/admin-guide/abi-stable.rst index 33637c0d4fd50..528c68401f4b9 100644 --- a/Documentation/admin-guide/abi-stable.rst +++ b/Documentation/admin-guide/abi-stable.rst @@ -13,3 +13,4 @@ Most interfaces (like syscalls) are expected to never change and always be available. .. kernel-abi:: stable + :no-files: diff --git a/Documentation/admin-guide/abi-testing-files.rst b/Documentation/admin-guide/abi-testing-files.rst new file mode 100644 index 0000000000000..1da868e42fdb8 --- /dev/null +++ b/Documentation/admin-guide/abi-testing-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Testing ABI Files +================= + +.. kernel-abi:: testing + :no-symbols: diff --git a/Documentation/admin-guide/abi-testing.rst b/Documentation/admin-guide/abi-testing.rst index 55054985a8ffb..6153ebd38e2d0 100644 --- a/Documentation/admin-guide/abi-testing.rst +++ b/Documentation/admin-guide/abi-testing.rst @@ -19,3 +19,4 @@ name to the description of these interfaces, so that the kernel developers can easily notify them if any changes occur. .. kernel-abi:: testing + :no-files: diff --git a/Documentation/admin-guide/abi.rst b/Documentation/admin-guide/abi.rst index 64e772bde943d..15a2dcb1388ca 100644 --- a/Documentation/admin-guide/abi.rst +++ b/Documentation/admin-guide/abi.rst @@ -4,6 +4,9 @@ Linux ABI description ===================== +ABI symbols +----------- + .. toctree:: :maxdepth: 2 @@ -11,3 +14,15 @@ Linux ABI description abi-testing abi-obsolete abi-removed + +ABI files +--------- + +.. toctree:: + :maxdepth: 2 + + abi-readme-file + abi-stable-files + abi-testing-files + abi-obsolete-files + abi-removed-files -- GitLab From c940816968da6ef9a9462b7c070cc333d609a16c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:11 +0100 Subject: [PATCH 192/934] docs: sphinx/automarkup: add cross-references for ABI Now that all ABI files are handled together, we can add a feature at automarkup for it to generate cross-references for ABI symbols. The cross-reference logic can produce references for all existing files, except for README (as this is not parsed). For symbols, they need to be an exact match of what it is described at the docs, which is not always true due to wildcards. If symbols at /sys /proc and /config are identical, a cross-reference will be used. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/0b97a51b68b1c20127ad4a6a55658557fe0848d0.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/automarkup.py | 45 ++++++++++++++++++++++++++++++ scripts/lib/abi/abi_parser.py | 11 ++++++++ 2 files changed, 56 insertions(+) diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py index a413f8dd51158..7d91c39b4ca67 100644 --- a/Documentation/sphinx/automarkup.py +++ b/Documentation/sphinx/automarkup.py @@ -11,6 +11,8 @@ from sphinx.errors import NoUri import re from itertools import chain +from kernel_abi import kernel_abi + # # Python 2 lacks re.ASCII... # @@ -48,6 +50,8 @@ RE_typedef = re.compile(r'\b(typedef)\s+([a-zA-Z_]\w+)', flags=ascii_p3) # an optional extension # RE_doc = re.compile(r'(\bDocumentation/)?((\.\./)*[\w\-/]+)\.(rst|txt)') +RE_abi_file = re.compile(r'(\bDocumentation/ABI/[\w\-/]+)') +RE_abi_symbol = re.compile(r'(\b/(sys|config|proc)/[\w\-/]+)') RE_namespace = re.compile(r'^\s*..\s*c:namespace::\s*(\S+)\s*$') @@ -84,10 +88,14 @@ def markup_refs(docname, app, node): # Associate each regex with the function that will markup its matches # markup_func_sphinx2 = {RE_doc: markup_doc_ref, + RE_abi_file: markup_abi_ref, + RE_abi_symbol: markup_abi_ref, RE_function: markup_c_ref, RE_generic_type: markup_c_ref} markup_func_sphinx3 = {RE_doc: markup_doc_ref, + RE_abi_file: markup_abi_ref, + RE_abi_symbol: markup_abi_ref, RE_function: markup_func_ref_sphinx3, RE_struct: markup_c_ref, RE_union: markup_c_ref, @@ -270,6 +278,43 @@ def markup_doc_ref(docname, app, match): else: return nodes.Text(match.group(0)) +# +# Try to replace a documentation reference of the form Documentation/ABI/... +# with a cross reference to that page +# +def markup_abi_ref(docname, app, match): + stddom = app.env.domains['std'] + # + # Go through the dance of getting an xref out of the std domain + # + fname = match.group(1) + target = kernel_abi.xref(fname) + + # Kernel ABI doesn't describe such file or symbol + if not target: + return nodes.Text(match.group(0)) + + pxref = addnodes.pending_xref('', refdomain = 'std', reftype = 'ref', + reftarget = target, modname = None, + classname = None, refexplicit = False) + + # + # XXX The Latex builder will throw NoUri exceptions here, + # work around that by ignoring them. + # + try: + xref = stddom.resolve_xref(app.env, docname, app.builder, 'ref', + target, pxref, None) + except NoUri: + xref = None + # + # Return the xref if we got it; otherwise just return the plain text. + # + if xref: + return xref + else: + return nodes.Text(match.group(0)) + def get_c_namespace(app, docname): source = app.env.doc2path(docname) with open(source) as f: diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 87d1b9e14bb3e..3b1ab4c0bdd75 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -391,6 +391,17 @@ class AbiParser: return desc + "\n\n" + def xref(self, fname): + """ + Converts a Documentation/ABI + basename into a ReST cross-reference + """ + + xref = self.file_refs.get(fname) + if not xref: + return None + else: + return xref + def desc_rst(self, desc): """Enrich ReST output by creating cross-references""" -- GitLab From 5ca0e7ffc898ad43bbd1360adf50398fa817dbc9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:12 +0100 Subject: [PATCH 193/934] docs: sphinx/kernel_abi: avoid warnings during Sphinx module init Sphinx logging system doesn't like warnings during module load, as it understands that such logs are produced at the wrong time: WARNING: while setting up extension automarkup: /sys/devices/system/cpu/cpuX/topology/physical_package_id is defined 2 times: /new_devel/v4l/docs/Documentation/ABI/stable/sysfs-devices-system-cpu:27; /new_devel/v4l/docs/Documentation/ABI/testing/sysfs-devices-system-cpu:70 WARNING: while setting up extension automarkup: /sys/devices/system/cpu/cpuX/topology/ppin is defined 2 times: /new_devel/v4l/docs/Documentation/ABI/stable/sysfs-devices-system-cpu:89; /new_devel/v4l/docs/Documentation/ABI/testing/sysfs-devices-system-cpu:70 So, use a function to allocate/process ABI files and use it to be called at kernel_abi.py, as automarkup also needs it to produce the right cross-references. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/b0e79dc60d556e3b39fa6774d3b7bf734b73f352.1739182025.git.mchehab+huawei@kernel.org --- Documentation/sphinx/automarkup.py | 4 +++- Documentation/sphinx/kernel_abi.py | 23 +++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py index 7d91c39b4ca67..22defc18d6d2c 100644 --- a/Documentation/sphinx/automarkup.py +++ b/Documentation/sphinx/automarkup.py @@ -11,7 +11,7 @@ from sphinx.errors import NoUri import re from itertools import chain -from kernel_abi import kernel_abi +from kernel_abi import get_kernel_abi # # Python 2 lacks re.ASCII... @@ -287,6 +287,8 @@ def markup_abi_ref(docname, app, match): # # Go through the dance of getting an xref out of the std domain # + kernel_abi = get_kernel_abi() + fname = match.group(1) target = kernel_abi.xref(fname) diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 964f586de171d..e017b02999532 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -52,10 +52,23 @@ __version__ = "1.0" logger = logging.getLogger('kernel_abi') path = os.path.join(srctree, "Documentation/ABI") -# Parse ABI symbols only once -kernel_abi = AbiParser(path, logger=logger) -kernel_abi.parse_abi() -kernel_abi.check_issues() +_kernel_abi = None + +def get_kernel_abi(): + u""" + Initialize kernel_abi global var, if not initialized yet. + + This is needed to avoid warnings during Sphinx module initialization. + """ + global _kernel_abi + + if not _kernel_abi: + # Parse ABI symbols only once + _kernel_abi = AbiParser(path, logger=logger) + _kernel_abi.parse_abi() + _kernel_abi.check_issues() + + return _kernel_abi def setup(app): @@ -83,6 +96,8 @@ class KernelCmd(Directive): } def run(self): + kernel_abi = get_kernel_abi() + doc = self.state.document if not doc.settings.file_insertion_enabled: raise self.warning("docutils: file insertion disabled") -- GitLab From dc525a7650d70668c4d54cf03b4cba793b72cb5a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:13 +0100 Subject: [PATCH 194/934] scripts/lib/abi/abi_parser.py: Rename title name for ABI files This makes them look better when generating cross-references. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/e44574cb2796861d6acbce839068ed3ef385d16c.1739182025.git.mchehab+huawei@kernel.org --- scripts/lib/abi/abi_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 3b1ab4c0bdd75..0c3837e52afa6 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -314,7 +314,7 @@ class AbiParser: fdata.ftype = path.split("/")[0] fdata.nametag = {} - fdata.nametag["what"] = [f"File {path}/{basename}"] + fdata.nametag["what"] = [f"ABI file {path}/{basename}"] fdata.nametag["type"] = "File" fdata.nametag["path"] = fdata.ftype fdata.nametag["file"] = [fdata.file_ref] -- GitLab From 6649b4217089c5d17dc210946baf9c9537c7fb5d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:14 +0100 Subject: [PATCH 195/934] scripts/lib/abi/abi_parser.py: make it backward-compatible with Python 3.6 Despite being introduced on Python 3.6, the original implementation was too limited: it doesn't accept anything but the argument. Even on python 3.10.12, support was still limited, as more complex operations cause SyntaxError: Exception occurred: File ".../linux/Documentation/sphinx/kernel_abi.py", line 48, in from get_abi import AbiParser File ".../linux/scripts/lib/abi/abi_parser.py", line 525 msg += f"{part}\n{"-" * len(part)}\n\n" ^ SyntaxError: f-string: expecting '}' Replace f-strings by normal string concatenation when it doesn't work on Python 3.6. Reported-by: Akira Yokosawa Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/41d2f85df134a46db46fed73a0f9697a3d2ae9ba.1739182025.git.mchehab+huawei@kernel.org --- scripts/lib/abi/abi_parser.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index 0c3837e52afa6..f08de6d3bf7c1 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -493,7 +493,7 @@ class AbiParser: if cur_part and cur_part != part: part = cur_part - msg += f"{part}\n{"-" * len(part)}\n\n" + msg += part + "\n"+ "-" * len(part) +"\n\n" msg += f".. _{key}:\n\n" @@ -517,7 +517,7 @@ class AbiParser: msg += f"Defined on file :ref:`{base} <{ref[1]}>`\n\n" if wtype == "File": - msg += f"{names[0]}\n{"-" * len(names[0])}\n\n" + msg += names[0] +"\n" + "-" * len(names[0]) +"\n\n" desc = v.get("description") if not desc and wtype != "File": @@ -541,7 +541,8 @@ class AbiParser: users = v.get("users") if users and users.strip(" \t\n"): - msg += f"Users:\n\t{users.strip("\n").replace('\n', '\n\t')}\n\n" + users = users.strip("\n").replace('\n', '\n\t') + msg += f"Users:\n\t{users}\n\n" ln = v.get("line_no", 1) @@ -567,7 +568,9 @@ class AbiParser: elif len(lines) == 1: f.append(f"{fname}:{lines[0]}") else: - f.append(f"{fname} lines {", ".join(str(x) for x in lines)}") + m = fname + "lines " + m += ", ".join(str(x) for x in lines) + f.append(m) self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) @@ -615,10 +618,11 @@ class AbiParser: if users: print(f"Users:\t\t\t{users}") - print(f"Defined on file{'s'[:len(files) ^ 1]}:\t{", ".join(files)}") + print("Defined on file(s):\t" + ", ".join(files)) if desc: - print(f"\n{desc.strip("\n")}\n") + desc = desc.strip("\n") + print(f"\n{desc}\n") if not found_keys: print(f"Regular expression /{expr}/ not found.") -- GitLab From 0d5fd96880d9135a4b35fb5523896b21b13dde78 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:15 +0100 Subject: [PATCH 196/934] scripts/get_abi.py: add support for undefined ABIs The undefined logic is complex and has lots of magic on it. Implement it, using the same algorithm we have at get_abi.pl. Yet, some tweaks to optimize performance and to make the code simpler were added here: - at the perl version, the tree graph had loops, so we had to use BFS to traverse it. On this version, the graph is a tree, so, it simplifies the what group for sysfs aliases; - the logic which splits regular expressions into subgroups was re-written to make it faster; - it may optionally use multiple processes to search for symbol matches; - it has some additional debug levels. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/1529c255845d117696d5af57d8dc05554663afdf.1739182025.git.mchehab+huawei@kernel.org --- scripts/get_abi.py | 68 ++++++ scripts/lib/abi/abi_regex.py | 234 ++++++++++++++++++ scripts/lib/abi/helpers.py | 16 +- scripts/lib/abi/system_symbols.py | 378 ++++++++++++++++++++++++++++++ 4 files changed, 693 insertions(+), 3 deletions(-) create mode 100644 scripts/lib/abi/abi_regex.py create mode 100644 scripts/lib/abi/system_symbols.py diff --git a/scripts/get_abi.py b/scripts/get_abi.py index 19f78d6aa4079..7ce4748a46d20 100755 --- a/scripts/get_abi.py +++ b/scripts/get_abi.py @@ -20,7 +20,9 @@ SRC_DIR = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) from abi_parser import AbiParser # pylint: disable=C0413 +from abi_regex import AbiRegex # pylint: disable=C0413 from helpers import ABI_DIR, DEBUG_HELP # pylint: disable=C0413 +from system_symbols import SystemSymbols # pylint: disable=C0413 # Command line classes @@ -111,6 +113,71 @@ class AbiSearch: parser.parse_abi() parser.search_symbols(args.expression) +UNDEFINED_DESC=""" +Check undefined ABIs on local machine. + +Read sysfs devnodes and check if the devnodes there are defined inside +ABI documentation. + +The search logic tries to minimize the number of regular expressions to +search per each symbol. + +By default, it runs on a single CPU, as Python support for CPU threads +is still experimental, and multi-process runs on Python is very slow. + +On experimental tests, if the number of ABI symbols to search per devnode +is contained on a limit of ~150 regular expressions, using a single CPU +is a lot faster than using multiple processes. However, if the number of +regular expressions to check is at the order of ~30000, using multiple +CPUs speeds up the check. +""" + +class AbiUndefined: + """ + Initialize an argparse subparser for logic to check undefined ABI at + the current machine's sysfs + """ + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("undefined", + formatter_class=argparse.RawTextHelpFormatter, + description=UNDEFINED_DESC) + + parser.add_argument("-S", "--sysfs-dir", default="/sys", + help="directory where sysfs is mounted") + parser.add_argument("-s", "--search-string", + help="search string regular expression to limit symbol search") + parser.add_argument("-H", "--show-hints", action="store_true", + help="Hints about definitions for missing ABI symbols.") + parser.add_argument("-j", "--jobs", "--max-workers", type=int, default=1, + help="If bigger than one, enables multiprocessing.") + parser.add_argument("-c", "--max-chunk-size", type=int, default=50, + help="Maximum number of chunk size") + parser.add_argument("-f", "--found", action="store_true", + help="Also show found items. " + "Helpful to debug the parser."), + parser.add_argument("-d", "--dry-run", action="store_true", + help="Don't actually search for undefined. " + "Helpful to debug the parser."), + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + abi = AbiRegex(args.dir, debug=args.debug, + search_string=args.search_string) + + abi_symbols = SystemSymbols(abi=abi, hints=args.show_hints, + sysfs=args.sysfs_dir) + + abi_symbols.check_undefined_symbols(dry_run=args.dry_run, + found=args.found, + max_workers=args.jobs, + chunk_size=args.max_chunk_size) + def main(): """Main program""" @@ -125,6 +192,7 @@ def main(): AbiRest(subparsers) AbiValidate(subparsers) AbiSearch(subparsers) + AbiUndefined(subparsers) args = parser.parse_args() diff --git a/scripts/lib/abi/abi_regex.py b/scripts/lib/abi/abi_regex.py new file mode 100644 index 0000000000000..8a57846cbc69c --- /dev/null +++ b/scripts/lib/abi/abi_regex.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# xxpylint: disable=R0903 +# Copyright(c) 2025: Mauro Carvalho Chehab . +# SPDX-License-Identifier: GPL-2.0 + +""" +Convert ABI what into regular expressions +""" + +import re +import sys + +from pprint import pformat + +from abi_parser import AbiParser +from helpers import AbiDebug + +class AbiRegex(AbiParser): + """Extends AbiParser to search ABI nodes with regular expressions""" + + # Escape only ASCII visible characters + escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])" + leave_others = "others" + + # Tuples with regular expressions to be compiled and replacement data + re_whats = [ + # Drop escape characters that might exist + (re.compile("\\\\"), ""), + + # Temporarily escape dot characters + (re.compile(r"\."), "\xf6"), + + # Temporarily change [0-9]+ type of patterns + (re.compile(r"\[0\-9\]\+"), "\xff"), + + # Temporarily change [\d+-\d+] type of patterns + (re.compile(r"\[0\-\d+\]"), "\xff"), + (re.compile(r"\[0:\d+\]"), "\xff"), + (re.compile(r"\[(\d+)\]"), "\xf4\\\\d+\xf5"), + + # Temporarily change [0-9] type of patterns + (re.compile(r"\[(\d)\-(\d)\]"), "\xf4\1-\2\xf5"), + + # Handle multiple option patterns + (re.compile(r"[\{\<\[]([\w_]+)(?:[,|]+([\w_]+)){1,}[\}\>\]]"), r"(\1|\2)"), + + # Handle wildcards + (re.compile(r"([^\/])\*"), "\\1\\\\w\xf7"), + (re.compile(r"/\*/"), "/.*/"), + (re.compile(r"/\xf6\xf6\xf6"), "/.*"), + (re.compile(r"\<[^\>]+\>"), "\\\\w\xf7"), + (re.compile(r"\{[^\}]+\}"), "\\\\w\xf7"), + (re.compile(r"\[[^\]]+\]"), "\\\\w\xf7"), + + (re.compile(r"XX+"), "\\\\w\xf7"), + (re.compile(r"([^A-Z])[XYZ]([^A-Z])"), "\\1\\\\w\xf7\\2"), + (re.compile(r"([^A-Z])[XYZ]$"), "\\1\\\\w\xf7"), + (re.compile(r"_[AB]_"), "_\\\\w\xf7_"), + + # Recover [0-9] type of patterns + (re.compile(r"\xf4"), "["), + (re.compile(r"\xf5"), "]"), + + # Remove duplicated spaces + (re.compile(r"\s+"), r" "), + + # Special case: drop comparison as in: + # What: foo = + # (this happens on a few IIO definitions) + (re.compile(r"\s*\=.*$"), ""), + + # Escape all other symbols + (re.compile(escape_symbols), r"\\\1"), + (re.compile(r"\\\\"), r"\\"), + (re.compile(r"\\([\[\]\(\)\|])"), r"\1"), + (re.compile(r"(\d+)\\(-\d+)"), r"\1\2"), + + (re.compile(r"\xff"), r"\\d+"), + + # Special case: IIO ABI which a parenthesis. + (re.compile(r"sqrt(.*)"), r"sqrt(.*)"), + + # Simplify regexes with multiple .* + (re.compile(r"(?:\.\*){2,}"), ""), + + # Recover dot characters + (re.compile(r"\xf6"), "\\."), + # Recover plus characters + (re.compile(r"\xf7"), "+"), + ] + re_has_num = re.compile(r"\\d") + + # Symbol name after escape_chars that are considered a devnode basename + re_symbol_name = re.compile(r"(\w|\\[\.\-\:])+$") + + # List of popular group names to be skipped to minimize regex group size + # Use AbiDebug.SUBGROUP_SIZE to detect those + skip_names = set(["devices", "hwmon"]) + + def regex_append(self, what, new): + """ + Get a search group for a subset of regular expressions. + + As ABI may have thousands of symbols, using a for to search all + regular expressions is at least O(n^2). When there are wildcards, + the complexity increases substantially, eventually becoming exponential. + + To avoid spending too much time on them, use a logic to split + them into groups. The smaller the group, the better, as it would + mean that searches will be confined to a small number of regular + expressions. + + The conversion to a regex subset is tricky, as we need something + that can be easily obtained from the sysfs symbol and from the + regular expression. So, we need to discard nodes that have + wildcards. + + If it can't obtain a subgroup, place the regular expression inside + a special group (self.leave_others). + """ + + search_group = None + + for search_group in reversed(new.split("/")): + if not search_group or search_group in self.skip_names: + continue + if self.re_symbol_name.match(search_group): + break + + if not search_group: + search_group = self.leave_others + + if self.debug & AbiDebug.SUBGROUP_MAP: + self.log.debug("%s: mapped as %s", what, search_group) + + try: + if search_group not in self.regex_group: + self.regex_group[search_group] = [] + + self.regex_group[search_group].append(re.compile(new)) + if self.search_string: + if what.find(self.search_string) >= 0: + print(f"What: {what}") + except re.PatternError: + self.log.warning("Ignoring '%s' as it produced an invalid regex:\n" + " '%s'", what, new) + + def get_regexes(self, what): + """ + Given an ABI devnode, return a list of all regular expressions that + may match it, based on the sub-groups created by regex_append() + """ + + re_list = [] + + patches = what.split("/") + patches.reverse() + patches.append(self.leave_others) + + for search_group in patches: + if search_group in self.regex_group: + re_list += self.regex_group[search_group] + + return re_list + + def __init__(self, *args, **kwargs): + """ + Override init method to get verbose argument + """ + + self.regex_group = None + self.search_string = None + self.re_string = None + + if "search_string" in kwargs: + self.search_string = kwargs.get("search_string") + del kwargs["search_string"] + + if self.search_string: + + try: + self.re_string = re.compile(self.search_string) + except re.PatternError as e: + msg = f"{self.search_string} is not a valid regular expression" + raise ValueError(msg) from e + + super().__init__(*args, **kwargs) + + def parse_abi(self, *args, **kwargs): + + super().parse_abi(*args, **kwargs) + + self.regex_group = {} + + print("Converting ABI What fields into regexes...", file=sys.stderr) + + for t in sorted(self.data.items(), key=lambda x: x[0]): + v = t[1] + if v.get("type") == "File": + continue + + v["regex"] = [] + + for what in v.get("what", []): + if not what.startswith("/sys"): + continue + + new = what + for r, s in self.re_whats: + try: + new = r.sub(s, new) + except re.PatternError as e: + # Help debugging troubles with new regexes + raise re.PatternError(f"{e}\nwhile re.sub('{r.pattern}', {s}, str)") from e + + v["regex"].append(new) + + if self.debug & AbiDebug.REGEX: + self.log.debug("%-90s <== %s", new, what) + + # Store regex into a subgroup to speedup searches + self.regex_append(what, new) + + if self.debug & AbiDebug.SUBGROUP_DICT: + self.log.debug("%s", pformat(self.regex_group)) + + if self.debug & AbiDebug.SUBGROUP_SIZE: + biggestd_keys = sorted(self.regex_group.keys(), + key= lambda k: len(self.regex_group[k]), + reverse=True) + + print("Top regex subgroups:", file=sys.stderr) + for k in biggestd_keys[:10]: + print(f"{k} has {len(self.regex_group[k])} elements", file=sys.stderr) diff --git a/scripts/lib/abi/helpers.py b/scripts/lib/abi/helpers.py index 84a253ed50587..639b23e4ca33c 100644 --- a/scripts/lib/abi/helpers.py +++ b/scripts/lib/abi/helpers.py @@ -16,13 +16,23 @@ class AbiDebug: WHAT_PARSING = 1 WHAT_OPEN = 2 DUMP_ABI_STRUCTS = 4 + UNDEFINED = 8 + REGEX = 16 + SUBGROUP_MAP = 32 + SUBGROUP_DICT = 64 + SUBGROUP_SIZE = 128 + GRAPH = 256 DEBUG_HELP = """ -Print debug information according with the level(s), -which is given by the following bitmask: - 1 - enable debug parsing logic 2 - enable debug messages on file open 4 - enable debug for ABI parse data +8 - enable extra debug information to identify troubles + with ABI symbols found at the local machine that + weren't found on ABI documentation (used only for + undefined subcommand) +16 - enable debug for what to regex conversion +32 - enable debug for symbol regex subgroups +64 - enable debug for sysfs graph tree variable """ diff --git a/scripts/lib/abi/system_symbols.py b/scripts/lib/abi/system_symbols.py new file mode 100644 index 0000000000000..f15c94a6e33ce --- /dev/null +++ b/scripts/lib/abi/system_symbols.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0912,R0914,R0915,R1702 +# Copyright(c) 2025: Mauro Carvalho Chehab . +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. +""" + +import os +import re +import sys + +from concurrent import futures +from datetime import datetime +from random import shuffle + +from helpers import AbiDebug + +class SystemSymbols: + """Stores arguments for the class and initialize class vars""" + + def graph_add_file(self, path, link=None): + """ + add a file path to the sysfs graph stored at self.root + """ + + if path in self.files: + return + + name = "" + ref = self.root + for edge in path.split("/"): + name += edge + "/" + if edge not in ref: + ref[edge] = {"__name": [name.rstrip("/")]} + + ref = ref[edge] + + if link and link not in ref["__name"]: + ref["__name"].append(link.rstrip("/")) + + self.files.add(path) + + def print_graph(self, root_prefix="", root=None, level=0): + """Prints a reference tree graph using UTF-8 characters""" + + if not root: + root = self.root + level = 0 + + # Prevent endless traverse + if level > 5: + return + + if level > 0: + prefix = "├──" + last_prefix = "└──" + else: + prefix = "" + last_prefix = "" + + items = list(root.items()) + + names = root.get("__name", []) + for k, edge in items: + if k == "__name": + continue + + if not k: + k = "/" + + if len(names) > 1: + k += " links: " + ",".join(names[1:]) + + if edge == items[-1][1]: + print(root_prefix + last_prefix + k) + p = root_prefix + if level > 0: + p += " " + self.print_graph(p, edge, level + 1) + else: + print(root_prefix + prefix + k) + p = root_prefix + "│ " + self.print_graph(p, edge, level + 1) + + def _walk(self, root): + """ + Walk through sysfs to get all devnodes that aren't ignored. + + By default, uses /sys as sysfs mounting point. If another + directory is used, it replaces them to /sys at the patches. + """ + + with os.scandir(root) as obj: + for entry in obj: + path = os.path.join(root, entry.name) + if self.sysfs: + p = path.replace(self.sysfs, "/sys", count=1) + else: + p = path + + if self.re_ignore.search(p): + return + + # Handle link first to avoid directory recursion + if entry.is_symlink(): + real = os.path.realpath(path) + if not self.sysfs: + self.aliases[path] = real + else: + real = real.replace(self.sysfs, "/sys", count=1) + + # Add absfile location to graph if it doesn't exist + if not self.re_ignore.search(real): + # Add link to the graph + self.graph_add_file(real, p) + + elif entry.is_file(): + self.graph_add_file(p) + + elif entry.is_dir(): + self._walk(path) + + def __init__(self, abi, sysfs="/sys", hints=False): + """ + Initialize internal variables and get a list of all files inside + sysfs that can currently be parsed. + + Please notice that there are several entries on sysfs that aren't + documented as ABI. Ignore those. + + The real paths will be stored under self.files. Aliases will be + stored in separate, as self.aliases. + """ + + self.abi = abi + self.log = abi.log + + if sysfs != "/sys": + self.sysfs = sysfs.rstrip("/") + else: + self.sysfs = None + + self.hints = hints + + self.root = {} + self.aliases = {} + self.files = set() + + dont_walk = [ + # Those require root access and aren't documented at ABI + f"^{sysfs}/kernel/debug", + f"^{sysfs}/kernel/tracing", + f"^{sysfs}/fs/pstore", + f"^{sysfs}/fs/bpf", + f"^{sysfs}/fs/fuse", + + # This is not documented at ABI + f"^{sysfs}/module", + + f"^{sysfs}/fs/cgroup", # this is big and has zero docs under ABI + f"^{sysfs}/firmware", # documented elsewhere: ACPI, DT bindings + "sections|notes", # aren't actually part of ABI + + # kernel-parameters.txt - not easy to parse + "parameters", + ] + + self.re_ignore = re.compile("|".join(dont_walk)) + + print(f"Reading {sysfs} directory contents...", file=sys.stderr) + self._walk(sysfs) + + def check_file(self, refs, found): + """Check missing ABI symbols for a given sysfs file""" + + res_list = [] + + try: + for names in refs: + fname = names[0] + + res = { + "found": False, + "fname": fname, + "msg": "", + } + res_list.append(res) + + re_what = self.abi.get_regexes(fname) + if not re_what: + self.abi.log.warning(f"missing rules for {fname}") + continue + + for name in names: + for r in re_what: + if self.abi.debug & AbiDebug.UNDEFINED: + self.log.debug("check if %s matches '%s'", name, r.pattern) + if r.match(name): + res["found"] = True + if found: + res["msg"] += f" {fname}: regex:\n\t" + continue + + if self.hints and not res["found"]: + res["msg"] += f" {fname} not found. Tested regexes:\n" + for r in re_what: + res["msg"] += " " + r.pattern + "\n" + + except KeyboardInterrupt: + pass + + return res_list + + def _ref_interactor(self, root): + """Recursive function to interact over the sysfs tree""" + + for k, v in root.items(): + if isinstance(v, dict): + yield from self._ref_interactor(v) + + if root == self.root or k == "__name": + continue + + if self.abi.re_string: + fname = v["__name"][0] + if self.abi.re_string.search(fname): + yield v + else: + yield v + + + def get_fileref(self, all_refs, chunk_size): + """Interactor to group refs into chunks""" + + n = 0 + refs = [] + + for ref in all_refs: + refs.append(ref) + + n += 1 + if n >= chunk_size: + yield refs + n = 0 + refs = [] + + yield refs + + def check_undefined_symbols(self, max_workers=None, chunk_size=50, + found=None, dry_run=None): + """Seach ABI for sysfs symbols missing documentation""" + + self.abi.parse_abi() + + if self.abi.debug & AbiDebug.GRAPH: + self.print_graph() + + all_refs = [] + for ref in self._ref_interactor(self.root): + all_refs.append(ref["__name"]) + + if dry_run: + print("Would check", file=sys.stderr) + for ref in all_refs: + print(", ".join(ref)) + + return + + print("Starting to search symbols (it may take several minutes):", + file=sys.stderr) + start = datetime.now() + old_elapsed = None + + # Python doesn't support multithreading due to limitations on its + # global lock (GIL). While Python 3.13 finally made GIL optional, + # there are still issues related to it. Also, we want to have + # backward compatibility with older versions of Python. + # + # So, use instead multiprocess. However, Python is very slow passing + # data from/to multiple processes. Also, it may consume lots of memory + # if the data to be shared is not small. So, we need to group workload + # in chunks that are big enough to generate performance gains while + # not being so big that would cause out-of-memory. + + num_refs = len(all_refs) + print(f"Number of references to parse: {num_refs}", file=sys.stderr) + + if not max_workers: + max_workers = os.cpu_count() + elif max_workers > os.cpu_count(): + max_workers = os.cpu_count() + + max_workers = max(max_workers, 1) + + max_chunk_size = int((num_refs + max_workers - 1) / max_workers) + chunk_size = min(chunk_size, max_chunk_size) + chunk_size = max(1, chunk_size) + + if max_workers > 1: + executor = futures.ProcessPoolExecutor + + # Place references in a random order. This may help improving + # performance, by mixing complex/simple expressions when creating + # chunks + shuffle(all_refs) + else: + # Python has a high overhead with processes. When there's just + # one worker, it is faster to not create a new process. + # Yet, User still deserves to have a progress print. So, use + # python's "thread", which is actually a single process, using + # an internal schedule to switch between tasks. No performance + # gains for non-IO tasks, but still it can be quickly interrupted + # from time to time to display progress. + executor = futures.ThreadPoolExecutor + + not_found = [] + f_list = [] + with executor(max_workers=max_workers) as exe: + for refs in self.get_fileref(all_refs, chunk_size): + if refs: + try: + f_list.append(exe.submit(self.check_file, refs, found)) + + except KeyboardInterrupt: + return + + total = len(f_list) + + if not total: + if self.abi.re_string: + print(f"No ABI symbol matches {self.abi.search_string}") + else: + self.abi.log.warning("No ABI symbols found") + return + + print(f"{len(f_list):6d} jobs queued on {max_workers} workers", + file=sys.stderr) + + while f_list: + try: + t = futures.wait(f_list, timeout=1, + return_when=futures.FIRST_COMPLETED) + + done = t[0] + + for fut in done: + res_list = fut.result() + + for res in res_list: + if not res["found"]: + not_found.append(res["fname"]) + if res["msg"]: + print(res["msg"]) + + f_list.remove(fut) + except KeyboardInterrupt: + return + + except RuntimeError as e: + self.abi.log.warning(f"Future: {e}") + break + + if sys.stderr.isatty(): + elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0] + if len(f_list) < total: + elapsed += f" ({total - len(f_list)}/{total} jobs completed). " + if elapsed != old_elapsed: + print(elapsed + "\r", end="", flush=True, + file=sys.stderr) + old_elapsed = elapsed + + elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0] + print(elapsed, file=sys.stderr) + + for f in sorted(not_found): + print(f"{f} not found.") -- GitLab From 1c7e66bc5d20ac7779130e146d70066b3af4711c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2025 11:18:16 +0100 Subject: [PATCH 197/934] scripts/get_abi.pl: drop now obsoleted script As all functionalities of it were migrated to get_abi.py, drop the now obsoleted script. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/698ec258b36b63ccde5f7da1af9c97cf8df51050.1739182025.git.mchehab+huawei@kernel.org --- scripts/get_abi.pl | 1103 -------------------------------------------- 1 file changed, 1103 deletions(-) delete mode 100755 scripts/get_abi.pl diff --git a/scripts/get_abi.pl b/scripts/get_abi.pl deleted file mode 100755 index de1c0354b50ca..0000000000000 --- a/scripts/get_abi.pl +++ /dev/null @@ -1,1103 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -BEGIN { $Pod::Usage::Formatter = 'Pod::Text::Termcap'; } - -use strict; -use warnings; -use utf8; -use Pod::Usage qw(pod2usage); -use Getopt::Long; -use File::Find; -use IO::Handle; -use Fcntl ':mode'; -use Cwd 'abs_path'; -use Data::Dumper; - -my $help = 0; -my $hint = 0; -my $man = 0; -my $debug = 0; -my $enable_lineno = 0; -my $show_warnings = 1; -my $prefix="Documentation/ABI"; -my $sysfs_prefix="/sys"; -my $search_string; - -# Debug options -my $dbg_what_parsing = 1; -my $dbg_what_open = 2; -my $dbg_dump_abi_structs = 4; -my $dbg_undefined = 8; - -$Data::Dumper::Indent = 1; -$Data::Dumper::Terse = 1; - -# -# If true, assumes that the description is formatted with ReST -# -my $description_is_rst = 1; - -GetOptions( - "debug=i" => \$debug, - "enable-lineno" => \$enable_lineno, - "rst-source!" => \$description_is_rst, - "dir=s" => \$prefix, - 'help|?' => \$help, - "show-hints" => \$hint, - "search-string=s" => \$search_string, - man => \$man -) or pod2usage(2); - -pod2usage(1) if $help; -pod2usage(-exitstatus => 0, -noperldoc, -verbose => 2) if $man; - -pod2usage(2) if (scalar @ARGV < 1 || @ARGV > 2); - -my ($cmd, $arg) = @ARGV; - -pod2usage(2) if ($cmd ne "search" && $cmd ne "rest" && $cmd ne "validate" && $cmd ne "undefined"); -pod2usage(2) if ($cmd eq "search" && !$arg); - -require Data::Dumper if ($debug & $dbg_dump_abi_structs); - -my %data; -my %symbols; - -# -# Displays an error message, printing file name and line -# -sub parse_error($$$$) { - my ($file, $ln, $msg, $data) = @_; - - return if (!$show_warnings); - - $data =~ s/\s+$/\n/; - - print STDERR "Warning: file $file#$ln:\n\t$msg"; - - if ($data ne "") { - print STDERR ". Line\n\t\t$data"; - } else { - print STDERR "\n"; - } -} - -# -# Parse an ABI file, storing its contents at %data -# -sub parse_abi { - my $file = $File::Find::name; - - my $mode = (stat($file))[2]; - return if ($mode & S_IFDIR); - return if ($file =~ m,/README,); - return if ($file =~ m,/\.,); - return if ($file =~ m,\.(rej|org|orig|bak)$,); - - my $name = $file; - $name =~ s,.*/,,; - - my $fn = $file; - $fn =~ s,.*Documentation/ABI/,,; - - my $nametag = "File $fn"; - $data{$nametag}->{what} = "File $name"; - $data{$nametag}->{type} = "File"; - $data{$nametag}->{file} = $name; - $data{$nametag}->{filepath} = $file; - $data{$nametag}->{is_file} = 1; - $data{$nametag}->{line_no} = 1; - - my $type = $file; - $type =~ s,.*/(.*)/.*,$1,; - - my $what; - my $new_what; - my $tag = ""; - my $ln; - my $xrefs; - my $space; - my @labels; - my $label = ""; - - print STDERR "Opening $file\n" if ($debug & $dbg_what_open); - open IN, $file; - while() { - $ln++; - if (m/^(\S+)(:\s*)(.*)/i) { - my $new_tag = lc($1); - my $sep = $2; - my $content = $3; - - if (!($new_tag =~ m/(what|where|date|kernelversion|contact|description|users)/)) { - if ($tag eq "description") { - # New "tag" is actually part of - # description. Don't consider it a tag - $new_tag = ""; - } elsif ($tag ne "") { - parse_error($file, $ln, "tag '$tag' is invalid", $_); - } - } - - # Invalid, but it is a common mistake - if ($new_tag eq "where") { - parse_error($file, $ln, "tag 'Where' is invalid. Should be 'What:' instead", ""); - $new_tag = "what"; - } - - if ($new_tag =~ m/what/) { - $space = ""; - $content =~ s/[,.;]$//; - - push @{$symbols{$content}->{file}}, " $file:" . ($ln - 1); - - if ($tag =~ m/what/) { - $what .= "\xac" . $content; - } else { - if ($what) { - parse_error($file, $ln, "What '$what' doesn't have a description", "") if (!$data{$what}->{description}); - - foreach my $w(split /\xac/, $what) { - $symbols{$w}->{xref} = $what; - }; - } - - $what = $content; - $label = $content; - $new_what = 1; - } - push @labels, [($content, $label)]; - $tag = $new_tag; - - push @{$data{$nametag}->{symbols}}, $content if ($data{$nametag}->{what}); - next; - } - - if ($tag ne "" && $new_tag) { - $tag = $new_tag; - - if ($new_what) { - @{$data{$what}->{label_list}} = @labels if ($data{$nametag}->{what}); - @labels = (); - $label = ""; - $new_what = 0; - - $data{$what}->{type} = $type; - if (!defined($data{$what}->{file})) { - $data{$what}->{file} = $name; - $data{$what}->{filepath} = $file; - } else { - $data{$what}->{description} .= "\n\n" if (defined($data{$what}->{description})); - if ($name ne $data{$what}->{file}) { - $data{$what}->{file} .= " " . $name; - $data{$what}->{filepath} .= " " . $file; - } - } - print STDERR "\twhat: $what\n" if ($debug & $dbg_what_parsing); - $data{$what}->{line_no} = $ln; - } else { - $data{$what}->{line_no} = $ln if (!defined($data{$what}->{line_no})); - } - - if (!$what) { - parse_error($file, $ln, "'What:' should come first:", $_); - next; - } - if ($new_tag eq "description") { - $sep =~ s,:, ,; - $content = ' ' x length($new_tag) . $sep . $content; - while ($content =~ s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e) {} - if ($content =~ m/^(\s*)(\S.*)$/) { - # Preserve initial spaces for the first line - $space = $1; - $content = "$2\n"; - $data{$what}->{$tag} .= $content; - } else { - undef($space); - } - - } else { - $data{$what}->{$tag} = $content; - } - next; - } - } - - # Store any contents before tags at the database - if (!$tag && $data{$nametag}->{what}) { - $data{$nametag}->{description} .= $_; - next; - } - - if ($tag eq "description") { - my $content = $_; - while ($content =~ s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e) {} - if (m/^\s*\n/) { - $data{$what}->{$tag} .= "\n"; - next; - } - - if (!defined($space)) { - # Preserve initial spaces for the first line - if ($content =~ m/^(\s*)(\S.*)$/) { - $space = $1; - $content = "$2\n"; - } - } else { - $space = "" if (!($content =~ s/^($space)//)); - } - $data{$what}->{$tag} .= $content; - - next; - } - if (m/^\s*(.*)/) { - $data{$what}->{$tag} .= "\n$1"; - $data{$what}->{$tag} =~ s/\n+$//; - next; - } - - # Everything else is error - parse_error($file, $ln, "Unexpected content", $_); - } - $data{$nametag}->{description} =~ s/^\n+// if ($data{$nametag}->{description}); - if ($what) { - parse_error($file, $ln, "What '$what' doesn't have a description", "") if (!$data{$what}->{description}); - - foreach my $w(split /\xac/,$what) { - $symbols{$w}->{xref} = $what; - }; - } - close IN; -} - -sub create_labels { - my %labels; - - foreach my $what (keys %data) { - next if ($data{$what}->{file} eq "File"); - - foreach my $p (@{$data{$what}->{label_list}}) { - my ($content, $label) = @{$p}; - $label = "abi_" . $label . " "; - $label =~ tr/A-Z/a-z/; - - # Convert special chars to "_" - $label =~s/([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff])/_/g; - $label =~ s,_+,_,g; - $label =~ s,_$,,; - - # Avoid duplicated labels - while (defined($labels{$label})) { - my @chars = ("A".."Z", "a".."z"); - $label .= $chars[rand @chars]; - } - $labels{$label} = 1; - - $data{$what}->{label} = $label; - - # only one label is enough - last; - } - } -} - -# -# Outputs the book on ReST format -# - -# \b doesn't work well with paths. So, we need to define something else: -# Boundaries are punct characters, spaces and end-of-line -my $start = qr {(^|\s|\() }x; -my $bondary = qr { ([,.:;\)\s]|\z) }x; -my $xref_match = qr { $start(\/(sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)$bondary }x; -my $symbols = qr { ([\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff]) }x; - -sub output_rest { - create_labels(); - - my $part = ""; - - foreach my $what (sort { - ($data{$a}->{type} eq "File") cmp ($data{$b}->{type} eq "File") || - $a cmp $b - } keys %data) { - my $type = $data{$what}->{type}; - - my @file = split / /, $data{$what}->{file}; - my @filepath = split / /, $data{$what}->{filepath}; - - if ($enable_lineno) { - printf ".. LINENO %s%s#%s\n\n", - $prefix, $file[0], - $data{$what}->{line_no}; - } - - my $w = $what; - - if ($type ne "File") { - my $cur_part = $what; - if ($what =~ '/') { - if ($what =~ m#^(\/?(?:[\w\-]+\/?){1,2})#) { - $cur_part = "Symbols under $1"; - $cur_part =~ s,/$,,; - } - } - - if ($cur_part ne "" && $part ne $cur_part) { - $part = $cur_part; - my $bar = $part; - $bar =~ s/./-/g; - print "$part\n$bar\n\n"; - } - - printf ".. _%s:\n\n", $data{$what}->{label}; - - my @names = split /\xac/,$w; - my $len = 0; - - foreach my $name (@names) { - $name =~ s/$symbols/\\$1/g; - $name = "**$name**"; - $len = length($name) if (length($name) > $len); - } - - print "+-" . "-" x $len . "-+\n"; - foreach my $name (@names) { - printf "| %s", $name . " " x ($len - length($name)) . " |\n"; - print "+-" . "-" x $len . "-+\n"; - } - - print "\n"; - } - - for (my $i = 0; $i < scalar(@filepath); $i++) { - my $path = $filepath[$i]; - my $f = $file[$i]; - - $path =~ s,.*/(.*/.*),$1,;; - $path =~ s,[/\-],_,g;; - my $fileref = "abi_file_".$path; - - if ($type eq "File") { - print ".. _$fileref:\n\n"; - } else { - print "Defined on file :ref:`$f <$fileref>`\n\n"; - } - } - - if ($type eq "File") { - my $bar = $w; - $bar =~ s/./-/g; - print "$w\n$bar\n\n"; - } - - my $desc = ""; - $desc = $data{$what}->{description} if (defined($data{$what}->{description})); - $desc =~ s/\s+$/\n/; - - if (!($desc =~ /^\s*$/)) { - if ($description_is_rst) { - # Remove title markups from the description - # Having titles inside ABI files will only work if extra - # care would be taken in order to strictly follow the same - # level order for each markup. - $desc =~ s/\n[\-\*\=\^\~]+\n/\n\n/g; - - # Enrich text by creating cross-references - - my $new_desc = ""; - my $init_indent = -1; - my $literal_indent = -1; - - open(my $fh, "+<", \$desc); - while (my $d = <$fh>) { - my $indent = $d =~ m/^(\s+)/; - my $spaces = length($indent); - $init_indent = $indent if ($init_indent < 0); - if ($literal_indent >= 0) { - if ($spaces > $literal_indent) { - $new_desc .= $d; - next; - } else { - $literal_indent = -1; - } - } else { - if ($d =~ /()::$/ && !($d =~ /^\s*\.\./)) { - $literal_indent = $spaces; - } - } - - $d =~ s,Documentation/(?!devicetree)(\S+)\.rst,:doc:`/$1`,g; - - my @matches = $d =~ m,Documentation/ABI/([\w\/\-]+),g; - foreach my $f (@matches) { - my $xref = $f; - my $path = $f; - $path =~ s,.*/(.*/.*),$1,;; - $path =~ s,[/\-],_,g;; - $xref .= " "; - $d =~ s,\bDocumentation/ABI/$f\b,:ref:`$xref`,g; - } - - # Seek for cross reference symbols like /sys/... - @matches = $d =~ m/$xref_match/g; - - foreach my $s (@matches) { - next if (!($s =~ m,/,)); - if (defined($data{$s}) && defined($data{$s}->{label})) { - my $xref = $s; - - $xref =~ s/$symbols/\\$1/g; - $xref = ":ref:`$xref <" . $data{$s}->{label} . ">`"; - - $d =~ s,$start$s$bondary,$1$xref$2,g; - } - } - $new_desc .= $d; - } - close $fh; - - - print "$new_desc\n\n"; - } else { - $desc =~ s/^\s+//; - - # Remove title markups from the description, as they won't work - $desc =~ s/\n[\-\*\=\^\~]+\n/\n\n/g; - - if ($desc =~ m/\:\n/ || $desc =~ m/\n[\t ]+/ || $desc =~ m/[\x00-\x08\x0b-\x1f\x7b-\xff]/) { - # put everything inside a code block - $desc =~ s/\n/\n /g; - - print "::\n\n"; - print " $desc\n\n"; - } else { - # Escape any special chars from description - $desc =~s/([\x00-\x08\x0b-\x1f\x21-\x2a\x2d\x2f\x3c-\x40\x5c\x5e-\x60\x7b-\xff])/\\$1/g; - print "$desc\n\n"; - } - } - } else { - print "DESCRIPTION MISSING for $what\n\n" if (!$data{$what}->{is_file}); - } - - if ($data{$what}->{symbols}) { - printf "Has the following ABI:\n\n"; - - foreach my $content(@{$data{$what}->{symbols}}) { - my $label = $data{$symbols{$content}->{xref}}->{label}; - - # Escape special chars from content - $content =~s/([\x00-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])/\\$1/g; - - print "- :ref:`$content <$label>`\n\n"; - } - } - - if (defined($data{$what}->{users})) { - my $users = $data{$what}->{users}; - - $users =~ s/\n/\n\t/g; - printf "Users:\n\t%s\n\n", $users if ($users ne ""); - } - - } -} - -# -# Searches for ABI symbols -# -sub search_symbols { - foreach my $what (sort keys %data) { - next if (!($what =~ m/($arg)/)); - - my $type = $data{$what}->{type}; - next if ($type eq "File"); - - my $file = $data{$what}->{filepath}; - - $what =~ s/\xac/, /g; - my $bar = $what; - $bar =~ s/./-/g; - - print "\n$what\n$bar\n\n"; - - my $kernelversion = $data{$what}->{kernelversion} if (defined($data{$what}->{kernelversion})); - my $contact = $data{$what}->{contact} if (defined($data{$what}->{contact})); - my $users = $data{$what}->{users} if (defined($data{$what}->{users})); - my $date = $data{$what}->{date} if (defined($data{$what}->{date})); - my $desc = $data{$what}->{description} if (defined($data{$what}->{description})); - - $kernelversion =~ s/^\s+// if ($kernelversion); - $contact =~ s/^\s+// if ($contact); - if ($users) { - $users =~ s/^\s+//; - $users =~ s/\n//g; - } - $date =~ s/^\s+// if ($date); - $desc =~ s/^\s+// if ($desc); - - printf "Kernel version:\t\t%s\n", $kernelversion if ($kernelversion); - printf "Date:\t\t\t%s\n", $date if ($date); - printf "Contact:\t\t%s\n", $contact if ($contact); - printf "Users:\t\t\t%s\n", $users if ($users); - print "Defined on file(s):\t$file\n\n"; - print "Description:\n\n$desc"; - } -} - -# Exclude /sys/kernel/debug and /sys/kernel/tracing from the search path -sub dont_parse_special_attributes { - if (($File::Find::dir =~ m,^/sys/kernel,)) { - return grep {!/(debug|tracing)/ } @_; - } - - if (($File::Find::dir =~ m,^/sys/fs,)) { - return grep {!/(pstore|bpf|fuse)/ } @_; - } - - return @_ -} - -my %leaf; -my %aliases; -my @files; -my %root; - -sub graph_add_file { - my $file = shift; - my $type = shift; - - my $dir = $file; - $dir =~ s,^(.*/).*,$1,; - $file =~ s,.*/,,; - - my $name; - my $file_ref = \%root; - foreach my $edge(split "/", $dir) { - $name .= "$edge/"; - if (!defined ${$file_ref}{$edge}) { - ${$file_ref}{$edge} = { }; - } - $file_ref = \%{$$file_ref{$edge}}; - ${$file_ref}{"__name"} = [ $name ]; - } - $name .= "$file"; - ${$file_ref}{$file} = { - "__name" => [ $name ] - }; - - return \%{$$file_ref{$file}}; -} - -sub graph_add_link { - my $file = shift; - my $link = shift; - - # Traverse graph to find the reference - my $file_ref = \%root; - foreach my $edge(split "/", $file) { - $file_ref = \%{$$file_ref{$edge}} || die "Missing node!"; - } - - # do a BFS - - my @queue; - my %seen; - my $st; - - push @queue, $file_ref; - $seen{$start}++; - - while (@queue) { - my $v = shift @queue; - my @child = keys(%{$v}); - - foreach my $c(@child) { - next if $seen{$$v{$c}}; - next if ($c eq "__name"); - - if (!defined($$v{$c}{"__name"})) { - printf STDERR "Error: Couldn't find a non-empty name on a children of $file/.*: "; - print STDERR Dumper(%{$v}); - exit; - } - - # Add new name - my $name = @{$$v{$c}{"__name"}}[0]; - if ($name =~ s#^$file/#$link/#) { - push @{$$v{$c}{"__name"}}, $name; - } - # Add child to the queue and mark as seen - push @queue, $$v{$c}; - $seen{$c}++; - } - } -} - -my $escape_symbols = qr { ([\x01-\x08\x0e-\x1f\x21-\x29\x2b-\x2d\x3a-\x40\x7b-\xfe]) }x; -sub parse_existing_sysfs { - my $file = $File::Find::name; - - my $mode = (lstat($file))[2]; - my $abs_file = abs_path($file); - - my @tmp; - push @tmp, $file; - push @tmp, $abs_file if ($abs_file ne $file); - - foreach my $f(@tmp) { - # Ignore cgroup, as this is big and has zero docs under ABI - return if ($f =~ m#^/sys/fs/cgroup/#); - - # Ignore firmware as it is documented elsewhere - # Either ACPI or under Documentation/devicetree/bindings/ - return if ($f =~ m#^/sys/firmware/#); - - # Ignore some sysfs nodes that aren't actually part of ABI - return if ($f =~ m#/sections|notes/#); - - # Would need to check at - # Documentation/admin-guide/kernel-parameters.txt, but this - # is not easily parseable. - return if ($f =~ m#/parameters/#); - } - - if (S_ISLNK($mode)) { - $aliases{$file} = $abs_file; - return; - } - - return if (S_ISDIR($mode)); - - # Trivial: file is defined exactly the same way at ABI What: - return if (defined($data{$file})); - return if (defined($data{$abs_file})); - - push @files, graph_add_file($abs_file, "file"); -} - -sub get_leave($) -{ - my $what = shift; - my $leave; - - my $l = $what; - my $stop = 1; - - $leave = $l; - $leave =~ s,/$,,; - $leave =~ s,.*/,,; - $leave =~ s/[\(\)]//g; - - # $leave is used to improve search performance at - # check_undefined_symbols, as the algorithm there can seek - # for a small number of "what". It also allows giving a - # hint about a leave with the same name somewhere else. - # However, there are a few occurences where the leave is - # either a wildcard or a number. Just group such cases - # altogether. - if ($leave =~ m/\.\*/ || $leave eq "" || $leave =~ /\\d/) { - $leave = "others"; - } - - return $leave; -} - -my @not_found; - -sub check_file($$) -{ - my $file_ref = shift; - my $names_ref = shift; - my @names = @{$names_ref}; - my $file = $names[0]; - - my $found_string; - - my $leave = get_leave($file); - if (!defined($leaf{$leave})) { - $leave = "others"; - } - my @expr = @{$leaf{$leave}->{expr}}; - die ("\rmissing rules for $leave") if (!defined($leaf{$leave})); - - my $path = $file; - $path =~ s,(.*/).*,$1,; - - if ($search_string) { - return if (!($file =~ m#$search_string#)); - $found_string = 1; - } - - for (my $i = 0; $i < @names; $i++) { - if ($found_string && $hint) { - if (!$i) { - print STDERR "--> $names[$i]\n"; - } else { - print STDERR " $names[$i]\n"; - } - } - foreach my $re (@expr) { - print STDERR "$names[$i] =~ /^$re\$/\n" if ($debug && $dbg_undefined); - if ($names[$i] =~ $re) { - return; - } - } - } - - if ($leave ne "others") { - my @expr = @{$leaf{"others"}->{expr}}; - for (my $i = 0; $i < @names; $i++) { - foreach my $re (@expr) { - print STDERR "$names[$i] =~ /^$re\$/\n" if ($debug && $dbg_undefined); - if ($names[$i] =~ $re) { - return; - } - } - } - } - - push @not_found, $file if (!$search_string || $found_string); - - if ($hint && (!$search_string || $found_string)) { - my $what = $leaf{$leave}->{what}; - $what =~ s/\xac/\n\t/g; - if ($leave ne "others") { - print STDERR "\r more likely regexes:\n\t$what\n"; - } else { - print STDERR "\r tested regexes:\n\t$what\n"; - } - } -} - -sub check_undefined_symbols { - my $num_files = scalar @files; - my $next_i = 0; - my $start_time = times; - - @files = sort @files; - - my $last_time = $start_time; - - # When either debug or hint is enabled, there's no sense showing - # progress, as the progress will be overriden. - if ($hint || ($debug && $dbg_undefined)) { - $next_i = $num_files; - } - - my $is_console; - $is_console = 1 if (-t STDERR); - - for (my $i = 0; $i < $num_files; $i++) { - my $file_ref = $files[$i]; - my @names = @{$$file_ref{"__name"}}; - - check_file($file_ref, \@names); - - my $cur_time = times; - - if ($i == $next_i || $cur_time > $last_time + 1) { - my $percent = $i * 100 / $num_files; - - my $tm = $cur_time - $start_time; - my $time = sprintf "%d:%02d", int($tm), 60 * ($tm - int($tm)); - - printf STDERR "\33[2K\r", if ($is_console); - printf STDERR "%s: processing sysfs files... %i%%: $names[0]", $time, $percent; - printf STDERR "\n", if (!$is_console); - STDERR->flush(); - - $next_i = int (($percent + 1) * $num_files / 100); - $last_time = $cur_time; - } - } - - my $cur_time = times; - my $tm = $cur_time - $start_time; - my $time = sprintf "%d:%02d", int($tm), 60 * ($tm - int($tm)); - - printf STDERR "\33[2K\r", if ($is_console); - printf STDERR "%s: processing sysfs files... done\n", $time; - - foreach my $file (@not_found) { - print "$file not found.\n"; - } -} - -sub undefined_symbols { - print STDERR "Reading $sysfs_prefix directory contents..."; - find({ - wanted =>\&parse_existing_sysfs, - preprocess =>\&dont_parse_special_attributes, - no_chdir => 1 - }, $sysfs_prefix); - print STDERR "done.\n"; - - $leaf{"others"}->{what} = ""; - - print STDERR "Converting ABI What fields into regexes..."; - foreach my $w (sort keys %data) { - foreach my $what (split /\xac/,$w) { - next if (!($what =~ m/^$sysfs_prefix/)); - - # Convert what into regular expressions - - # Escape dot characters - $what =~ s/\./\xf6/g; - - # Temporarily change [0-9]+ type of patterns - $what =~ s/\[0\-9\]\+/\xff/g; - - # Temporarily change [\d+-\d+] type of patterns - $what =~ s/\[0\-\d+\]/\xff/g; - $what =~ s/\[(\d+)\]/\xf4$1\xf5/g; - - # Temporarily change [0-9] type of patterns - $what =~ s/\[(\d)\-(\d)\]/\xf4$1-$2\xf5/g; - - # Handle multiple option patterns - $what =~ s/[\{\<\[]([\w_]+)(?:[,|]+([\w_]+)){1,}[\}\>\]]/($1|$2)/g; - - # Handle wildcards - $what =~ s,\*,.*,g; - $what =~ s,/\xf6..,/.*,g; - $what =~ s/\<[^\>]+\>/.*/g; - $what =~ s/\{[^\}]+\}/.*/g; - $what =~ s/\[[^\]]+\]/.*/g; - - $what =~ s/[XYZ]/.*/g; - - # Recover [0-9] type of patterns - $what =~ s/\xf4/[/g; - $what =~ s/\xf5/]/g; - - # Remove duplicated spaces - $what =~ s/\s+/ /g; - - # Special case: this ABI has a parenthesis on it - $what =~ s/sqrt\(x^2\+y^2\+z^2\)/sqrt\(x^2\+y^2\+z^2\)/; - - # Special case: drop comparition as in: - # What: foo = - # (this happens on a few IIO definitions) - $what =~ s,\s*\=.*$,,; - - # Escape all other symbols - $what =~ s/$escape_symbols/\\$1/g; - $what =~ s/\\\\/\\/g; - $what =~ s/\\([\[\]\(\)\|])/$1/g; - $what =~ s/(\d+)\\(-\d+)/$1$2/g; - - $what =~ s/\xff/\\d+/g; - - # Special case: IIO ABI which a parenthesis. - $what =~ s/sqrt(.*)/sqrt\(.*\)/; - - # Simplify regexes with multiple .* - $what =~ s#(?:\.\*){2,}##g; -# $what =~ s#\.\*/\.\*#.*#g; - - # Recover dot characters - $what =~ s/\xf6/\./g; - - my $leave = get_leave($what); - - my $added = 0; - foreach my $l (split /\|/, $leave) { - if (defined($leaf{$l})) { - next if ($leaf{$l}->{what} =~ m/\b$what\b/); - $leaf{$l}->{what} .= "\xac" . $what; - $added = 1; - } else { - $leaf{$l}->{what} = $what; - $added = 1; - } - } - if ($search_string && $added) { - print STDERR "What: $what\n" if ($what =~ m#$search_string#); - } - - } - } - # Compile regexes - foreach my $l (sort keys %leaf) { - my @expr; - foreach my $w(sort split /\xac/, $leaf{$l}->{what}) { - push @expr, qr /^$w$/; - } - $leaf{$l}->{expr} = \@expr; - } - - # Take links into account - foreach my $link (sort keys %aliases) { - my $abs_file = $aliases{$link}; - graph_add_link($abs_file, $link); - } - print STDERR "done.\n"; - - check_undefined_symbols; -} - -# Ensure that the prefix will always end with a slash -# While this is not needed for find, it makes the patch nicer -# with --enable-lineno -$prefix =~ s,/?$,/,; - -if ($cmd eq "undefined" || $cmd eq "search") { - $show_warnings = 0; -} -# -# Parses all ABI files located at $prefix dir -# -find({wanted =>\&parse_abi, no_chdir => 1}, $prefix); - -print STDERR Data::Dumper->Dump([\%data], [qw(*data)]) if ($debug & $dbg_dump_abi_structs); - -# -# Handles the command -# -if ($cmd eq "undefined") { - undefined_symbols; -} elsif ($cmd eq "search") { - search_symbols; -} else { - if ($cmd eq "rest") { - output_rest; - } - - # Warn about duplicated ABI entries - foreach my $what(sort keys %symbols) { - my @files = @{$symbols{$what}->{file}}; - - next if (scalar(@files) == 1); - - printf STDERR "Warning: $what is defined %d times: @files\n", - scalar(@files); - } -} - -__END__ - -=head1 NAME - -get_abi.pl - parse the Linux ABI files and produce a ReST book. - -=head1 SYNOPSIS - -B [--debug ] [--enable-lineno] [--man] [--help] - [--(no-)rst-source] [--dir=] [--show-hints] - [--search-string ] - [] - -Where B can be: - -=over 8 - -B I - search for I inside ABI - -B - output the ABI in ReST markup language - -B - validate the ABI contents - -B - existing symbols at the system that aren't - defined at Documentation/ABI - -=back - -=head1 OPTIONS - -=over 8 - -=item B<--dir> - -Changes the location of the ABI search. By default, it uses -the Documentation/ABI directory. - -=item B<--rst-source> and B<--no-rst-source> - -The input file may be using ReST syntax or not. Those two options allow -selecting between a rst-compliant source ABI (B<--rst-source>), or a -plain text that may be violating ReST spec, so it requres some escaping -logic (B<--no-rst-source>). - -=item B<--enable-lineno> - -Enable output of .. LINENO lines. - -=item B<--debug> I - -Print debug information according with the level, which is given by the -following bitmask: - - - 1: Debug parsing What entries from ABI files; - - 2: Shows what files are opened from ABI files; - - 4: Dump the structs used to store the contents of the ABI files. - -=item B<--show-hints> - -Show hints about possible definitions for the missing ABI symbols. -Used only when B. - -=item B<--search-string> I - -Show only occurences that match a search string. -Used only when B. - -=item B<--help> - -Prints a brief help message and exits. - -=item B<--man> - -Prints the manual page and exits. - -=back - -=head1 DESCRIPTION - -Parse the Linux ABI files from ABI DIR (usually located at Documentation/ABI), -allowing to search for ABI symbols or to produce a ReST book containing -the Linux ABI documentation. - -=head1 EXAMPLES - -Search for all stable symbols with the word "usb": - -=over 8 - -$ scripts/get_abi.pl search usb --dir Documentation/ABI/stable - -=back - -Search for all symbols that match the regex expression "usb.*cap": - -=over 8 - -$ scripts/get_abi.pl search usb.*cap - -=back - -Output all obsoleted symbols in ReST format - -=over 8 - -$ scripts/get_abi.pl rest --dir Documentation/ABI/obsolete - -=back - -=head1 BUGS - -Report bugs to Mauro Carvalho Chehab - -=head1 COPYRIGHT - -Copyright (c) 2016-2021 by Mauro Carvalho Chehab . - -License GPLv2: GNU GPL version 2 . - -This is free software: you are free to change and redistribute it. -There is NO WARRANTY, to the extent permitted by law. - -=cut -- GitLab From 95767a592dc997eab17a4f58fcb16abff2101ba3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Feb 2025 09:46:25 -0800 Subject: [PATCH 198/934] docs: submitting-patches: document the format for affiliation Adding company name in round brackets to From/SoB lines is fairly common, but I don't see it documented anywhere. Every now and then people try to add the sponsorship lines to the commit message, fun example from this merge window: Sponsored by: The FreeBSD Foundation from commit 2ce67f8bf1ce ("wifi: iwlwifi: mvm: fix iwl_ssid_exist() check"). Better format would be: Author: Miri Korenblit (FreeBSD Foundation) <... Signed-off-by: Jakub Kicinski Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250203174626.1131225-1-kuba@kernel.org --- Documentation/process/submitting-patches.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 8fdc0ef3e604f..12ed28b3d1138 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -717,6 +717,12 @@ patch in the permanent changelog. If the ``from`` line is missing, then the ``From:`` line from the email header will be used to determine the patch author in the changelog. +The author may indicate their affiliation or the sponsor of the work +by adding the name of an organization to the ``from`` and ``SoB`` lines, +e.g.: + + From: Patch Author (Company) + Explanation Body ^^^^^^^^^^^^^^^^ -- GitLab From ccb7735a1ea22621f21c3133e4f5f3da5fe5f5b7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 27 Jan 2025 14:45:23 -0800 Subject: [PATCH 199/934] x86/fpu: Fully optimize out WARN_ON_FPU() Currently WARN_ON_FPU evaluates its argument even if CONFIG_X86_DEBUG_FPU is disabled, which adds unnecessary instructions to several functions, for example kernel_fpu_begin(). Fix this by using BUILD_BUG_ON_INVALID(x) in the no-debug case rather than (void)(x). Fixes: 83242c515881 ("x86/fpu: Make WARN_ON_FPU() more robust in the !CONFIG_X86_DEBUG_FPU case") Suggested-by: Sean Christopherson Signed-off-by: Eric Biggers Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250127224523.94300-1-ebiggers%40kernel.org --- arch/x86/kernel/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index dbdb31f55fc7f..975de070c9c98 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -18,7 +18,7 @@ static __always_inline __pure bool use_fxsr(void) #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ -- GitLab From 7da8e4ad4df0dd12f37357af62ce1b63e75ae2e6 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:47:58 +0900 Subject: [PATCH 200/934] elf: Define note name macros elf.h had a comment saying: > Notes used in ET_CORE. Architectures export some of the arch register > sets using the corresponding note types via the PTRACE_GETREGSET and > PTRACE_SETREGSET requests. > The note name for these types is "LINUX", except NT_PRFPREG that is > named "CORE". However, NT_PRSTATUS is also named "CORE". It is also unclear what "these types" refers to. To fix these problems, define a name for each note type. The added definitions are macros so the kernel and userspace can directly refer to them to remove their duplicate definitions of note names. Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-1-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- include/uapi/linux/elf.h | 89 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b44069d29cecc..592507aa9b3ad 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -368,101 +368,180 @@ typedef struct elf64_shdr { #define ELF_OSABI ELFOSABI_NONE #endif +/* Note definitions: NN_ defines names. NT_ defines types. */ + +#define NN_GNU_PROPERTY_TYPE_0 "GNU" +#define NT_GNU_PROPERTY_TYPE_0 5 + /* * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. - * The note name for these types is "LINUX", except NT_PRFPREG that is named - * "CORE". */ +#define NN_PRSTATUS "CORE" #define NT_PRSTATUS 1 +#define NN_PRFPREG "CORE" #define NT_PRFPREG 2 +#define NN_PRPSINFO "CORE" #define NT_PRPSINFO 3 +#define NN_TASKSTRUCT "CORE" #define NT_TASKSTRUCT 4 +#define NN_AUXV "CORE" #define NT_AUXV 6 /* * Note to userspace developers: size of NT_SIGINFO note may increase * in the future to accomodate more fields, don't assume it is fixed! */ +#define NN_SIGINFO "CORE" #define NT_SIGINFO 0x53494749 +#define NN_FILE "CORE" #define NT_FILE 0x46494c45 +#define NN_PRXFPREG "LINUX" #define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NN_PPC_VMX "LINUX" #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ +#define NN_PPC_SPE "LINUX" #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ +#define NN_PPC_VSX "LINUX" #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NN_PPC_TAR "LINUX" #define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NN_PPC_PPR "LINUX" #define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NN_PPC_DSCR "LINUX" #define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NN_PPC_EBB "LINUX" #define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NN_PPC_PMU "LINUX" #define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NN_PPC_TM_CGPR "LINUX" #define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NN_PPC_TM_CFPR "LINUX" #define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NN_PPC_TM_CVMX "LINUX" #define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NN_PPC_TM_CVSX "LINUX" #define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NN_PPC_TM_SPR "LINUX" #define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NN_PPC_TM_CTAR "LINUX" #define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NN_PPC_TM_CPPR "LINUX" #define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NN_PPC_TM_CDSCR "LINUX" #define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ +#define NN_PPC_PKEY "LINUX" #define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NN_PPC_DEXCR "LINUX" #define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NN_PPC_HASHKEYR "LINUX" #define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ +#define NN_386_TLS "LINUX" #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NN_386_IOPERM "LINUX" #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NN_X86_XSTATE "LINUX" #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ /* Old binutils treats 0x203 as a CET state */ +#define NN_X86_SHSTK "LINUX" #define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NN_X86_XSAVE_LAYOUT "LINUX" #define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ +#define NN_S390_HIGH_GPRS "LINUX" #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ +#define NN_S390_TIMER "LINUX" #define NT_S390_TIMER 0x301 /* s390 timer register */ +#define NN_S390_TODCMP "LINUX" #define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ +#define NN_S390_TODPREG "LINUX" #define NT_S390_TODPREG 0x303 /* s390 TOD programmable register */ +#define NN_S390_CTRS "LINUX" #define NT_S390_CTRS 0x304 /* s390 control registers */ +#define NN_S390_PREFIX "LINUX" #define NT_S390_PREFIX 0x305 /* s390 prefix register */ +#define NN_S390_LAST_BREAK "LINUX" #define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ +#define NN_S390_SYSTEM_CALL "LINUX" #define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ +#define NN_S390_TDB "LINUX" #define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NN_S390_VXRS_LOW "LINUX" #define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NN_S390_VXRS_HIGH "LINUX" #define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NN_S390_GS_CB "LINUX" #define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NN_S390_GS_BC "LINUX" #define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NN_S390_RI_CB "LINUX" #define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NN_S390_PV_CPU_DATA "LINUX" #define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NN_ARM_VFP "LINUX" #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +#define NN_ARM_TLS "LINUX" #define NT_ARM_TLS 0x401 /* ARM TLS register */ +#define NN_ARM_HW_BREAK "LINUX" #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ +#define NN_ARM_HW_WATCH "LINUX" #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NN_ARM_SYSTEM_CALL "LINUX" #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NN_ARM_SVE "LINUX" #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ +#define NN_ARM_PAC_MASK "LINUX" #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NN_ARM_PACA_KEYS "LINUX" #define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NN_ARM_PACG_KEYS "LINUX" #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NN_ARM_TAGGED_ADDR_CTRL "LINUX" #define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ +#define NN_ARM_PAC_ENABLED_KEYS "LINUX" #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NN_ARM_SSVE "LINUX" #define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NN_ARM_ZA "LINUX" #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ +#define NN_ARM_ZT "LINUX" #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NN_ARM_FPMR "LINUX" #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NN_ARM_POE "LINUX" #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NN_ARM_GCS "LINUX" #define NT_ARM_GCS 0x410 /* ARM GCS state */ +#define NN_ARC_V2 "LINUX" #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NN_VMCOREDD "LINUX" #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ +#define NN_MIPS_DSP "LINUX" #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ +#define NN_MIPS_FP_MODE "LINUX" #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ +#define NN_MIPS_MSA "LINUX" #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NN_RISCV_CSR "LINUX" #define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NN_RISCV_VECTOR "LINUX" #define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ +#define NN_RISCV_TAGGED_ADDR_CTRL "LINUX" #define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NN_LOONGARCH_CPUCFG "LINUX" #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NN_LOONGARCH_CSR "LINUX" #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NN_LOONGARCH_LSX "LINUX" #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NN_LOONGARCH_LASX "LINUX" #define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NN_LOONGARCH_LBT "LINUX" #define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ +#define NN_LOONGARCH_HW_BREAK "LINUX" #define NT_LOONGARCH_HW_BREAK 0xa05 /* LoongArch hardware breakpoint registers */ +#define NN_LOONGARCH_HW_WATCH "LINUX" #define NT_LOONGARCH_HW_WATCH 0xa06 /* LoongArch hardware watchpoint registers */ -/* Note types with note name "GNU" */ -#define NT_GNU_PROPERTY_TYPE_0 5 - /* Note header in a PT_NOTE section */ typedef struct elf32_note { Elf32_Word n_namesz; /* Name size */ -- GitLab From 2fc4947bbd91201b0bc137127d8e125d48985ad4 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:47:59 +0900 Subject: [PATCH 201/934] binfmt_elf: Use note name macros Use note name macros to match with the userspace's expectation. Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-2-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 21 ++++++++++----------- fs/binfmt_elf_fdpic.c | 8 ++++---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8054f44d39cf7..584fa89bc8776 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz, } #define NOTE_DATA_SZ SZ_1K -#define GNU_PROPERTY_TYPE_0_NAME "GNU" -#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) +#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) @@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), - GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, @@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); - fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); + fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata); } /* @@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm } size = name_curpos - (char *)data; - fill_note(note, "CORE", NT_FILE, size, data); + fill_note(note, NN_FILE, NT_FILE, size, data); return 0; } @@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); @@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX", note_type, ret, data); info->size += notesize(&t->notes[note_iter]); @@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); info->size += notesize(&t->notes[0]); @@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, } t->prstatus.pr_fpvalid = 1; - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; @@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c13ee8180b175..e3cf2801cd644 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1397,7 +1397,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &t->prstatus); t->num_notes++; *sz += notesize(&t->notes[0]); @@ -1415,7 +1415,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ } if (t->prstatus.pr_fpvalid) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu), &t->fpu); t->num_notes++; *sz += notesize(&t->notes[1]); @@ -1530,7 +1530,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ fill_psinfo(psinfo, current->group_leader, current->mm); - fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); thread_status_size += notesize(&psinfo_note); auxv = (elf_addr_t *) current->mm->saved_auxv; @@ -1538,7 +1538,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); offset = sizeof(*elf); /* ELF header */ -- GitLab From 609c8b30915697b3d520824b19b7f3a69eeda2c5 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:48:00 +0900 Subject: [PATCH 202/934] powerpc/crash: Use note name macros Use note name macros to match with the userspace's expectation. Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-3-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- arch/powerpc/kernel/fadump.c | 2 +- arch/powerpc/platforms/powernv/opal-core.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 4b371c738213c..d44349fe8e2b5 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -751,7 +751,7 @@ u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * prstatus.pr_pid = ???? */ elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); return buf; } diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index c9a9b759cc928..a379ff86c1201 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -149,7 +149,7 @@ static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf, /* end of vector */ bufp[idx++] = cpu_to_be64(AT_NULL); - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV, + buf = append_elf64_note(buf, NN_AUXV, NT_AUXV, oc_conf->auxv_buf, AUXV_DESC_SZ); return buf; } @@ -252,7 +252,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) * crashing CPU's prstatus. */ first_cpu_note = buf; - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { @@ -279,7 +279,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) fill_prstatus(&prstatus, thread_pir, ®s); if (thread_pir != oc_conf->crashing_cpu) { - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, + buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); } else { @@ -287,7 +287,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) * Add crashing CPU as the first NT_PRSTATUS note for * GDB to process the core file appropriately. */ - append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME, + append_elf64_note(first_cpu_note, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); } -- GitLab From 0de47f28ec8444ead678f96e729105ca4bcc6db2 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:48:01 +0900 Subject: [PATCH 203/934] crash: Use note name macros Use note name macros to match with the userspace's expectation. Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-4-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- fs/proc/kcore.c | 12 ++++++------ include/linux/vmcore_info.h | 2 +- kernel/crash_core.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1cb33771bf9fc..728630b10fdff 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -34,8 +34,6 @@ #include #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -122,7 +120,9 @@ static void update_kcore_size(void) kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); kcore_notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + @@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, arch_task_struct_size); /* * vmcoreinfo_size is mostly constant after init time, but it diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index e1dec1a6a749d..1672801fd98c0 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -8,7 +8,7 @@ #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 078fe5bc5a747..335b8425dd4b9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } -- GitLab From d4a760fb77fdac07efa3da4fa4a18f49f178d048 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:48:02 +0900 Subject: [PATCH 204/934] s390/crash: Use note name macros Use note name macros to match with the userspace's expectation. Signed-off-by: Akihiko Odaki Acked-by: Heiko Carstens Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-5-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- arch/s390/kernel/crash_dump.c | 62 +++++++++++++---------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 276cb4c1e11be..4a981266b4833 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -246,15 +246,6 @@ bool is_kdump_kernel(void) } EXPORT_SYMBOL_GPL(is_kdump_kernel); -static const char *nt_name(Elf64_Word type) -{ - const char *name = "LINUX"; - - if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) - name = KEXEC_CORE_NOTE_NAME; - return name; -} - /* * Initialize ELF note */ @@ -279,10 +270,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, return PTR_ADD(buf, len); } -static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len) -{ - return nt_init_name(buf, type, desc, d_len, nt_name(type)); -} +#define nt_init(buf, type, desc) \ + nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type) /* * Calculate the size of ELF note @@ -298,10 +287,7 @@ static size_t nt_size_name(int d_len, const char *name) return size; } -static inline size_t nt_size(Elf64_Word type, int d_len) -{ - return nt_size_name(d_len, nt_name(type)); -} +#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type) /* * Fill ELF notes for one CPU with save area registers @@ -322,18 +308,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); /* Create ELF notes for the CPU */ - ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); - ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); - ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); - ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); - ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); - ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); - ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); + ptr = nt_init(ptr, PRSTATUS, nt_prstatus); + ptr = nt_init(ptr, PRFPREG, nt_fpregset); + ptr = nt_init(ptr, S390_TIMER, sa->timer); + ptr = nt_init(ptr, S390_TODCMP, sa->todcmp); + ptr = nt_init(ptr, S390_TODPREG, sa->todpreg); + ptr = nt_init(ptr, S390_CTRS, sa->ctrs); + ptr = nt_init(ptr, S390_PREFIX, sa->prefix); if (cpu_has_vx()) { - ptr = nt_init(ptr, NT_S390_VXRS_HIGH, - &sa->vxrs_high, sizeof(sa->vxrs_high)); - ptr = nt_init(ptr, NT_S390_VXRS_LOW, - &sa->vxrs_low, sizeof(sa->vxrs_low)); + ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high); + ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low); } return ptr; } @@ -346,16 +330,16 @@ static size_t get_cpu_elf_notes_size(void) struct save_area *sa = NULL; size_t size; - size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); - size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); - size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); - size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); - size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); - size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); - size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); + size = nt_size(PRSTATUS, struct elf_prstatus); + size += nt_size(PRFPREG, elf_fpregset_t); + size += nt_size(S390_TIMER, sa->timer); + size += nt_size(S390_TODCMP, sa->todcmp); + size += nt_size(S390_TODPREG, sa->todpreg); + size += nt_size(S390_CTRS, sa->ctrs); + size += nt_size(S390_PREFIX, sa->prefix); if (cpu_has_vx()) { - size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); - size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + size += nt_size(S390_VXRS_HIGH, sa->vxrs_high); + size += nt_size(S390_VXRS_LOW, sa->vxrs_low); } return size; @@ -371,7 +355,7 @@ static void *nt_prpsinfo(void *ptr) memset(&prpsinfo, 0, sizeof(prpsinfo)); prpsinfo.pr_sname = 'R'; strcpy(prpsinfo.pr_fname, "vmlinux"); - return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); + return nt_init(ptr, PRPSINFO, prpsinfo); } /* @@ -610,7 +594,7 @@ static size_t get_elfcorehdr_size(int phdr_count) /* PT_NOTES */ size += sizeof(Elf64_Phdr); /* nt_prpsinfo */ - size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + size += nt_size(PRPSINFO, struct elf_prpsinfo); /* regsets */ size += get_cpu_cnt() * get_cpu_elf_notes_size(); /* nt_vmcoreinfo */ -- GitLab From 7e620b56d958a5efcb0158c366581e0637fd9a50 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:48:03 +0900 Subject: [PATCH 205/934] crash: Remove KEXEC_CORE_NOTE_NAME KEXEC_CORE_NOTE_NAME is no longer used. Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-6-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- include/linux/kexec.h | 2 -- include/linux/vmcore_info.h | 1 - 2 files changed, 3 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3c..c840431eaddad 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes; #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME - /* * This structure is used to hold the arguments that are used when loading * kernel binaries. diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index 1672801fd98c0..37e003ae52626 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -6,7 +6,6 @@ #include #include -#define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) -- GitLab From 3e50ba8fc834cadead733e4feeb969fce2f3b6e1 Mon Sep 17 00:00:00 2001 From: Luis Felipe Hernandez Date: Mon, 2 Dec 2024 15:55:38 +0800 Subject: [PATCH 206/934] lib: math: Move KUnit tests into tests/ subdir This patch is a follow-up task from a discussion stemming from point 3 in a recent patch introducing the int_pow kunit test [1] and documentation regarding kunit test style and nomenclature [2]. Colocate all kunit test suites in lib/math/tests/ and follow recommended naming convention for files _kunit.c and kconfig entries CONFIG__KUNIT_TEST. Link: https://lore.kernel.org/all/CABVgOS=-vh5TqHFCq_jo=ffq8v_nGgr6JsPnOZag3e6+19ysxQ@mail.gmail.com/ [1] Link: https://docs.kernel.org/dev-tools/kunit/style.html [2] Signed-off-by: Luis Felipe Hernandez Acked-by: Nicolas Pitre Reviewed-by: David Gow Reviewed-by: Rae Moar Link: https://lore.kernel.org/r/20241202075545.3648096-2-davidgow@google.com Signed-off-by: Kees Cook --- lib/Kconfig.debug | 2 +- lib/math/Makefile | 5 ++--- lib/math/tests/Makefile | 5 +++-- lib/math/{rational-test.c => tests/rational_kunit.c} | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename lib/math/{rational-test.c => tests/rational_kunit.c} (100%) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1af972a92d06f..b18dbfc53ca4b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3166,7 +3166,7 @@ config TEST_OBJPOOL If unsure, say N. -config INT_POW_TEST +config INT_POW_KUNIT_TEST tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS diff --git a/lib/math/Makefile b/lib/math/Makefile index 853f023ae5370..d1caba23baa0b 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -5,8 +5,7 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o obj-$(CONFIG_RATIONAL) += rational.o -obj-$(CONFIG_INT_POW_TEST) += tests/int_pow_kunit.o obj-$(CONFIG_TEST_DIV64) += test_div64.o obj-$(CONFIG_TEST_MULDIV64) += test_mul_u64_u64_div_u64.o -obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o -obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += tests/int_sqrt_kunit.o \ No newline at end of file + +obj-y += tests/ diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile index e1a79f093b2d0..a982f71800bdd 100644 --- a/lib/math/tests/Makefile +++ b/lib/math/tests/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_INT_POW_TEST) += int_pow_kunit.o -obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o +obj-$(CONFIG_INT_POW_KUNIT_TEST) += int_pow_kunit.o +obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o +obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational_kunit.o diff --git a/lib/math/rational-test.c b/lib/math/tests/rational_kunit.c similarity index 100% rename from lib/math/rational-test.c rename to lib/math/tests/rational_kunit.c -- GitLab From 84ec093f55f58f5a4a66eb98bd6b6af413190bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruno=20Sobreira=20Fran=C3=A7a?= Date: Mon, 2 Dec 2024 15:55:39 +0800 Subject: [PATCH 207/934] lib/math: Add int_log test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces KUnit tests for the intlog2 and intlog10 functions, which compute logarithms in base 2 and base 10, respectively. The tests cover a range of inputs to ensure the correctness of these functions across common and edge cases. Signed-off-by: Bruno Sobreira França Reviewed-by: David Gow Reviewed-by: Rae Moar Link: https://lore.kernel.org/r/20241202075545.3648096-3-davidgow@google.com Signed-off-by: Kees Cook --- lib/Kconfig.debug | 11 +++++ lib/math/tests/Makefile | 1 + lib/math/tests/int_log_kunit.c | 74 ++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) create mode 100644 lib/math/tests/int_log_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b18dbfc53ca4b..f30929ed1a84f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3197,6 +3197,17 @@ config INT_SQRT_KUNIT_TEST If unsure, say N +config INT_LOG_KUNIT_TEST + tristate "Integer log (int_log) test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the int_log library, which + provides two functions to compute the integer logarithm in base 2 and + base 10, called respectively as intlog2 and intlog10. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile index a982f71800bdd..14a245778394d 100644 --- a/lib/math/tests/Makefile +++ b/lib/math/tests/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_INT_LOG_KUNIT_TEST) += int_log_kunit.o obj-$(CONFIG_INT_POW_KUNIT_TEST) += int_pow_kunit.o obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational_kunit.o diff --git a/lib/math/tests/int_log_kunit.c b/lib/math/tests/int_log_kunit.c new file mode 100644 index 0000000000000..14e854146cb44 --- /dev/null +++ b/lib/math/tests/int_log_kunit.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +struct test_case_params { + u32 value; + unsigned int expected_result; + const char *name; +}; + + +/* The expected result takes into account the log error */ +static const struct test_case_params intlog2_params[] = { + {0, 0, "Log base 2 of 0"}, + {1, 0, "Log base 2 of 1"}, + {2, 16777216, "Log base 2 of 2"}, + {3, 26591232, "Log base 2 of 3"}, + {4, 33554432, "Log base 2 of 4"}, + {8, 50331648, "Log base 2 of 8"}, + {16, 67108864, "Log base 2 of 16"}, + {32, 83886080, "Log base 2 of 32"}, + {U32_MAX, 536870911, "Log base 2 of MAX"}, +}; + +static const struct test_case_params intlog10_params[] = { + {0, 0, "Log base 10 of 0"}, + {1, 0, "Log base 10 of 1"}, + {6, 13055203, "Log base 10 of 6"}, + {10, 16777225, "Log base 10 of 10"}, + {100, 33554450, "Log base 10 of 100"}, + {1000, 50331675, "Log base 10 of 1000"}, + {10000, 67108862, "Log base 10 of 10000"}, + {U32_MAX, 161614247, "Log base 10 of MAX"} +}; + +static void get_desc(const struct test_case_params *tc, char *desc) +{ + strscpy(desc, tc->name, KUNIT_PARAM_DESC_SIZE); +} + + +KUNIT_ARRAY_PARAM(intlog2, intlog2_params, get_desc); + +static void intlog2_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, intlog2(tc->value)); +} + +KUNIT_ARRAY_PARAM(intlog10, intlog10_params, get_desc); + +static void intlog10_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, intlog10(tc->value)); +} + +static struct kunit_case math_int_log_test_cases[] = { + KUNIT_CASE_PARAM(intlog2_test, intlog2_gen_params), + KUNIT_CASE_PARAM(intlog10_test, intlog10_gen_params), + {} +}; + +static struct kunit_suite int_log_test_suite = { + .name = "math-int_log", + .test_cases = math_int_log_test_cases, +}; + +kunit_test_suites(&int_log_test_suite); + +MODULE_DESCRIPTION("math.int_log KUnit test suite"); +MODULE_LICENSE("GPL"); -- GitLab From db6fe4d61ece24193eb4d94a82d967501d53358c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 2 Dec 2024 15:55:40 +0800 Subject: [PATCH 208/934] lib: Move KUnit tests into tests/ subdirectory Following from the recent KUnit file naming discussion[1], move all KUnit tests in lib/ into lib/tests/. Link: https://lore.kernel.org/lkml/20240720165441.it.320-kees@kernel.org/ [1] Acked-by: Steven Rostedt (Google) Acked-by: Jakub Kicinski Acked-by: Masami Hiramatsu (Google) Reviewed-by: David Gow Acked-by: Vlastimil Babka Reviewed-by: Rae Moar Link: https://lore.kernel.org/r/20241202075545.3648096-4-davidgow@google.com Signed-off-by: Kees Cook --- MAINTAINERS | 19 ++++++------ lib/Makefile | 39 +------------------------ lib/tests/Makefile | 40 ++++++++++++++++++++++++++ lib/{ => tests}/bitfield_kunit.c | 0 lib/{ => tests}/checksum_kunit.c | 0 lib/{ => tests}/cmdline_kunit.c | 0 lib/{ => tests}/cpumask_kunit.c | 0 lib/{ => tests}/crc_kunit.c | 0 lib/{ => tests}/fortify_kunit.c | 0 lib/{ => tests}/hashtable_test.c | 0 lib/{ => tests}/is_signed_type_kunit.c | 0 lib/{ => tests}/kunit_iov_iter.c | 0 lib/{ => tests}/list-test.c | 0 lib/{ => tests}/memcpy_kunit.c | 0 lib/{ => tests}/overflow_kunit.c | 0 lib/{ => tests}/siphash_kunit.c | 0 lib/{ => tests}/slub_kunit.c | 0 lib/{ => tests}/stackinit_kunit.c | 0 lib/{ => tests}/string_helpers_kunit.c | 0 lib/{ => tests}/string_kunit.c | 0 lib/{ => tests}/test_bits.c | 0 lib/{ => tests}/test_fprobe.c | 0 lib/{ => tests}/test_hash.c | 0 lib/{ => tests}/test_kprobes.c | 0 lib/{ => tests}/test_linear_ranges.c | 0 lib/{ => tests}/test_list_sort.c | 0 lib/{ => tests}/test_sort.c | 0 lib/{ => tests}/usercopy_kunit.c | 0 lib/{ => tests}/util_macros_kunit.c | 0 29 files changed, 51 insertions(+), 47 deletions(-) rename lib/{ => tests}/bitfield_kunit.c (100%) rename lib/{ => tests}/checksum_kunit.c (100%) rename lib/{ => tests}/cmdline_kunit.c (100%) rename lib/{ => tests}/cpumask_kunit.c (100%) rename lib/{ => tests}/crc_kunit.c (100%) rename lib/{ => tests}/fortify_kunit.c (100%) rename lib/{ => tests}/hashtable_test.c (100%) rename lib/{ => tests}/is_signed_type_kunit.c (100%) rename lib/{ => tests}/kunit_iov_iter.c (100%) rename lib/{ => tests}/list-test.c (100%) rename lib/{ => tests}/memcpy_kunit.c (100%) rename lib/{ => tests}/overflow_kunit.c (100%) rename lib/{ => tests}/siphash_kunit.c (100%) rename lib/{ => tests}/slub_kunit.c (100%) rename lib/{ => tests}/stackinit_kunit.c (100%) rename lib/{ => tests}/string_helpers_kunit.c (100%) rename lib/{ => tests}/string_kunit.c (100%) rename lib/{ => tests}/test_bits.c (100%) rename lib/{ => tests}/test_fprobe.c (100%) rename lib/{ => tests}/test_hash.c (100%) rename lib/{ => tests}/test_kprobes.c (100%) rename lib/{ => tests}/test_linear_ranges.c (100%) rename lib/{ => tests}/test_list_sort.c (100%) rename lib/{ => tests}/test_sort.c (100%) rename lib/{ => tests}/usercopy_kunit.c (100%) rename lib/{ => tests}/util_macros_kunit.c (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353de..78be36fb1a744 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4017,10 +4017,10 @@ F: include/vdso/bits.h F: lib/bitmap-str.c F: lib/bitmap.c F: lib/cpumask.c -F: lib/cpumask_kunit.c F: lib/find_bit.c F: lib/find_bit_benchmark.c F: lib/test_bitmap.c +F: lib/tests/cpumask_kunit.c F: tools/include/linux/bitfield.h F: tools/include/linux/bitmap.h F: tools/include/linux/bits.h @@ -9065,9 +9065,10 @@ L: linux-hardening@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: include/linux/fortify-string.h -F: lib/fortify_kunit.c -F: lib/memcpy_kunit.c F: lib/test_fortify/* +F: lib/tests/fortify_kunit.c +F: lib/tests/memcpy_kunit.c +F: scripts/test_fortify.sh K: \bunsafe_memcpy\b K: \b__NO_FORTIFY\b @@ -9755,9 +9756,9 @@ F: include/linux/string.h F: include/linux/string_choices.h F: include/linux/string_helpers.h F: lib/string.c -F: lib/string_kunit.c F: lib/string_helpers.c -F: lib/string_helpers_kunit.c +F: lib/tests/string_helpers_kunit.c +F: lib/tests/string_kunit.c F: scripts/coccinelle/api/string_choices.cocci GENERIC UIO DRIVER FOR PCI DEVICES @@ -12985,7 +12986,7 @@ F: Documentation/trace/kprobes.rst F: include/asm-generic/kprobes.h F: include/linux/kprobes.h F: kernel/kprobes.c -F: lib/test_kprobes.c +F: lib/tests/test_kprobes.c F: samples/kprobes KS0108 LCD CONTROLLER DRIVER @@ -13315,7 +13316,7 @@ M: Mark Brown R: Matti Vaittinen F: include/linux/linear_range.h F: lib/linear_ranges.c -F: lib/test_linear_ranges.c +F: lib/tests/test_linear_ranges.c LINUX FOR POWER MACINTOSH L: linuxppc-dev@lists.ozlabs.org @@ -13443,7 +13444,7 @@ M: David Gow L: linux-kselftest@vger.kernel.org L: kunit-dev@googlegroups.com S: Maintained -F: lib/list-test.c +F: lib/tests/list-test.c LITEX PLATFORM M: Karol Gugala @@ -21732,7 +21733,7 @@ M: Jason A. Donenfeld S: Maintained F: include/linux/siphash.h F: lib/siphash.c -F: lib/siphash_kunit.c +F: lib/tests/siphash_kunit.c SIS 190 ETHERNET DRIVER M: Francois Romieu diff --git a/lib/Makefile b/lib/Makefile index d5cfc7afbbb82..1e886482a6a37 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -52,9 +52,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ percpu-refcount.o rhashtable.o base64.o \ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o bitmap-str.o -obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o obj-y += string_helpers.o -obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o obj-y += hexdump.o obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o @@ -65,22 +63,17 @@ obj-$(CONFIG_TEST_DHRY) += test_dhry.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_BITOPS) += test_bitops.o CFLAGS_test_bitops.o += -Werror -obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o -obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o -obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) CFLAGS_test_ubsan.o += $(call cc-disable-warning, unused-but-set-variable) UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o -obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_MIN_HEAP) += test_min_heap.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o -obj-$(CONFIG_TEST_SORT) += test_sort.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o obj-$(CONFIG_TEST_DYNAMIC_DEBUG) += test_dynamic_debug.o @@ -98,7 +91,6 @@ obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o -obj-$(CONFIG_TEST_RUNTIME) += tests/ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o @@ -107,10 +99,7 @@ obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o obj-$(CONFIG_TEST_HMM) += test_hmm.o obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o -obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o -CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) -obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o obj-$(CONFIG_TEST_FPU) += test_fpu.o @@ -132,7 +121,7 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) -obj-y += math/ crypto/ +obj-y += math/ crypto/ tests/ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o @@ -368,32 +357,6 @@ obj-$(CONFIG_OBJAGG) += objagg.o # pldmfw library obj-$(CONFIG_PLDMFW) += pldmfw/ -# KUnit tests -CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) -obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o -obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o -obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o -obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o -obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o -obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o -obj-$(CONFIG_BITS_TEST) += test_bits.o -obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o -obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o -obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o -obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o -CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) -obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o -CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) -obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o -CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) -CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) -CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) -CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) -obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o -obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o -obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o -obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o - obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o obj-$(CONFIG_FIRMWARE_TABLE) += fw_table.o diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 8e4f42cb9c54f..c7b5170359c1e 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -1 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for tests of kernel library functions. + +# KUnit tests +CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) +obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o +obj-$(CONFIG_BITS_TEST) += test_bits.o +obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o +obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o +obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o +obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) +CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) +obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o +CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) +obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o +obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o +obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o +obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o +obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o +obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o +obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o +obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o +obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o +CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) +obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o +obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o +obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o +obj-$(CONFIG_TEST_SORT) += test_sort.o +CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) +obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o +obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o +obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o +obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o +obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o + obj-$(CONFIG_TEST_RUNTIME_MODULE) += module/ diff --git a/lib/bitfield_kunit.c b/lib/tests/bitfield_kunit.c similarity index 100% rename from lib/bitfield_kunit.c rename to lib/tests/bitfield_kunit.c diff --git a/lib/checksum_kunit.c b/lib/tests/checksum_kunit.c similarity index 100% rename from lib/checksum_kunit.c rename to lib/tests/checksum_kunit.c diff --git a/lib/cmdline_kunit.c b/lib/tests/cmdline_kunit.c similarity index 100% rename from lib/cmdline_kunit.c rename to lib/tests/cmdline_kunit.c diff --git a/lib/cpumask_kunit.c b/lib/tests/cpumask_kunit.c similarity index 100% rename from lib/cpumask_kunit.c rename to lib/tests/cpumask_kunit.c diff --git a/lib/crc_kunit.c b/lib/tests/crc_kunit.c similarity index 100% rename from lib/crc_kunit.c rename to lib/tests/crc_kunit.c diff --git a/lib/fortify_kunit.c b/lib/tests/fortify_kunit.c similarity index 100% rename from lib/fortify_kunit.c rename to lib/tests/fortify_kunit.c diff --git a/lib/hashtable_test.c b/lib/tests/hashtable_test.c similarity index 100% rename from lib/hashtable_test.c rename to lib/tests/hashtable_test.c diff --git a/lib/is_signed_type_kunit.c b/lib/tests/is_signed_type_kunit.c similarity index 100% rename from lib/is_signed_type_kunit.c rename to lib/tests/is_signed_type_kunit.c diff --git a/lib/kunit_iov_iter.c b/lib/tests/kunit_iov_iter.c similarity index 100% rename from lib/kunit_iov_iter.c rename to lib/tests/kunit_iov_iter.c diff --git a/lib/list-test.c b/lib/tests/list-test.c similarity index 100% rename from lib/list-test.c rename to lib/tests/list-test.c diff --git a/lib/memcpy_kunit.c b/lib/tests/memcpy_kunit.c similarity index 100% rename from lib/memcpy_kunit.c rename to lib/tests/memcpy_kunit.c diff --git a/lib/overflow_kunit.c b/lib/tests/overflow_kunit.c similarity index 100% rename from lib/overflow_kunit.c rename to lib/tests/overflow_kunit.c diff --git a/lib/siphash_kunit.c b/lib/tests/siphash_kunit.c similarity index 100% rename from lib/siphash_kunit.c rename to lib/tests/siphash_kunit.c diff --git a/lib/slub_kunit.c b/lib/tests/slub_kunit.c similarity index 100% rename from lib/slub_kunit.c rename to lib/tests/slub_kunit.c diff --git a/lib/stackinit_kunit.c b/lib/tests/stackinit_kunit.c similarity index 100% rename from lib/stackinit_kunit.c rename to lib/tests/stackinit_kunit.c diff --git a/lib/string_helpers_kunit.c b/lib/tests/string_helpers_kunit.c similarity index 100% rename from lib/string_helpers_kunit.c rename to lib/tests/string_helpers_kunit.c diff --git a/lib/string_kunit.c b/lib/tests/string_kunit.c similarity index 100% rename from lib/string_kunit.c rename to lib/tests/string_kunit.c diff --git a/lib/test_bits.c b/lib/tests/test_bits.c similarity index 100% rename from lib/test_bits.c rename to lib/tests/test_bits.c diff --git a/lib/test_fprobe.c b/lib/tests/test_fprobe.c similarity index 100% rename from lib/test_fprobe.c rename to lib/tests/test_fprobe.c diff --git a/lib/test_hash.c b/lib/tests/test_hash.c similarity index 100% rename from lib/test_hash.c rename to lib/tests/test_hash.c diff --git a/lib/test_kprobes.c b/lib/tests/test_kprobes.c similarity index 100% rename from lib/test_kprobes.c rename to lib/tests/test_kprobes.c diff --git a/lib/test_linear_ranges.c b/lib/tests/test_linear_ranges.c similarity index 100% rename from lib/test_linear_ranges.c rename to lib/tests/test_linear_ranges.c diff --git a/lib/test_list_sort.c b/lib/tests/test_list_sort.c similarity index 100% rename from lib/test_list_sort.c rename to lib/tests/test_list_sort.c diff --git a/lib/test_sort.c b/lib/tests/test_sort.c similarity index 100% rename from lib/test_sort.c rename to lib/tests/test_sort.c diff --git a/lib/usercopy_kunit.c b/lib/tests/usercopy_kunit.c similarity index 100% rename from lib/usercopy_kunit.c rename to lib/tests/usercopy_kunit.c diff --git a/lib/util_macros_kunit.c b/lib/tests/util_macros_kunit.c similarity index 100% rename from lib/util_macros_kunit.c rename to lib/tests/util_macros_kunit.c -- GitLab From 4d557cb4998654441a1018a9f6550d59098c0c9d Mon Sep 17 00:00:00 2001 From: Diego Vieira Date: Mon, 2 Dec 2024 15:55:41 +0800 Subject: [PATCH 209/934] lib/tests/kfifo_kunit.c: add tests for the kfifo structure Add KUnit tests for the kfifo data structure. They test the vast majority of macros defined in the kfifo header (include/linux/kfifo.h). These are inspired by the existing tests for the doubly linked list in lib/tests/list-test.c (previously at lib/list-test.c) [1]. Note that this patch depends on the patch that moves the KUnit tests on lib/ into lib/tests/ [2]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/list-test.c?h=v6.11-rc6 [2] https://lore.kernel.org/all/20240720181025.work.002-kees@kernel.org/ Signed-off-by: Diego Vieira Reviewed-by: David Gow Reviewed-by: Rae Moar Link: https://lore.kernel.org/r/20241202075545.3648096-5-davidgow@google.com Signed-off-by: Kees Cook --- lib/Kconfig.debug | 14 +++ lib/tests/Makefile | 1 + lib/tests/kfifo_kunit.c | 224 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 lib/tests/kfifo_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f30929ed1a84f..86ddf568cbca5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2691,6 +2691,20 @@ config SYSCTL_KUNIT_TEST If unsure, say N. +config KFIFO_KUNIT_TEST + tristate "KUnit Test for the generic kernel FIFO implementation" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds the generic FIFO implementation KUnit test suite. + It tests that the API and basic functionality of the kfifo type + and associated macros. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + config LIST_KUNIT_TEST tristate "KUnit Test for Kernel Linked-list structures" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/tests/Makefile b/lib/tests/Makefile index c7b5170359c1e..8696d778d92f3 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o +obj-$(CONFIG_KFIFO_KUNIT_TEST) += kfifo_kunit.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o diff --git a/lib/tests/kfifo_kunit.c b/lib/tests/kfifo_kunit.c new file mode 100644 index 0000000000000..a85eedc3195ad --- /dev/null +++ b/lib/tests/kfifo_kunit.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for the generic kernel FIFO implementation. + * + * Copyright (C) 2024 Diego Vieira + */ +#include + +#include + +#define KFIFO_SIZE 32 +#define N_ELEMENTS 5 + +static void kfifo_test_reset_should_clear_the_fifo(struct kunit *test) +{ + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + kfifo_put(&my_fifo, 1); + kfifo_put(&my_fifo, 2); + kfifo_put(&my_fifo, 3); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + kfifo_reset(&my_fifo); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); +} + +static void kfifo_test_define_should_define_an_empty_fifo(struct kunit *test) +{ + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); +} + +static void kfifo_test_len_should_ret_n_of_stored_elements(struct kunit *test) +{ + u8 buffer1[N_ELEMENTS]; + + for (int i = 0; i < N_ELEMENTS; i++) + buffer1[i] = i + 1; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); + + kfifo_in(&my_fifo, buffer1, N_ELEMENTS); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), N_ELEMENTS); + + kfifo_in(&my_fifo, buffer1, N_ELEMENTS); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), N_ELEMENTS * 2); + + kfifo_reset(&my_fifo); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); +} + +static void kfifo_test_put_should_insert_and_get_should_pop(struct kunit *test) +{ + u8 out_data = 0; + int processed_elements; + u8 elements[] = { 3, 5, 11 }; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + // If the fifo is empty, get returns 0 + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 0); + KUNIT_EXPECT_EQ(test, out_data, 0); + + for (int i = 0; i < 3; i++) + kfifo_put(&my_fifo, elements[i]); + + for (int i = 0; i < 3; i++) { + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, elements[i]); + } +} + +static void kfifo_test_in_should_insert_multiple_elements(struct kunit *test) +{ + u8 in_buffer[] = { 11, 25, 65 }; + u8 out_data; + int processed_elements; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + kfifo_in(&my_fifo, in_buffer, 3); + + for (int i = 0; i < 3; i++) { + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, in_buffer[i]); + } +} + +static void kfifo_test_out_should_pop_multiple_elements(struct kunit *test) +{ + u8 in_buffer[] = { 11, 25, 65 }; + u8 out_buffer[3]; + int copied_elements; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + for (int i = 0; i < 3; i++) + kfifo_put(&my_fifo, in_buffer[i]); + + copied_elements = kfifo_out(&my_fifo, out_buffer, 3); + KUNIT_EXPECT_EQ(test, copied_elements, 3); + + for (int i = 0; i < 3; i++) + KUNIT_EXPECT_EQ(test, out_buffer[i], in_buffer[i]); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); +} + +static void kfifo_test_dec_init_should_define_an_empty_fifo(struct kunit *test) +{ + DECLARE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + INIT_KFIFO(my_fifo); + + // my_fifo is a struct with an inplace buffer + KUNIT_EXPECT_FALSE(test, __is_kfifo_ptr(&my_fifo)); + + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); +} + +static void kfifo_test_define_should_equal_declare_init(struct kunit *test) +{ + // declare a variable my_fifo of type struct kfifo of u8 + DECLARE_KFIFO(my_fifo1, u8, KFIFO_SIZE); + // initialize the my_fifo variable + INIT_KFIFO(my_fifo1); + + // DEFINE_KFIFO declares the variable with the initial value + // essentially the same as calling DECLARE_KFIFO and INIT_KFIFO + DEFINE_KFIFO(my_fifo2, u8, KFIFO_SIZE); + + // my_fifo1 and my_fifo2 have the same size + KUNIT_EXPECT_EQ(test, sizeof(my_fifo1), sizeof(my_fifo2)); + KUNIT_EXPECT_EQ(test, kfifo_initialized(&my_fifo1), + kfifo_initialized(&my_fifo2)); + KUNIT_EXPECT_EQ(test, kfifo_is_empty(&my_fifo1), + kfifo_is_empty(&my_fifo2)); +} + +static void kfifo_test_alloc_should_initiliaze_a_ptr_fifo(struct kunit *test) +{ + int ret; + DECLARE_KFIFO_PTR(my_fifo, u8); + + INIT_KFIFO(my_fifo); + + // kfifo_initialized returns false signaling the buffer pointer is NULL + KUNIT_EXPECT_FALSE(test, kfifo_initialized(&my_fifo)); + + // kfifo_alloc allocates the buffer + ret = kfifo_alloc(&my_fifo, KFIFO_SIZE, GFP_KERNEL); + KUNIT_EXPECT_EQ_MSG(test, ret, 0, "Memory allocation should succeed"); + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); + + // kfifo_free frees the buffer + kfifo_free(&my_fifo); +} + +static void kfifo_test_peek_should_not_remove_elements(struct kunit *test) +{ + u8 out_data; + int processed_elements; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + // If the fifo is empty, peek returns 0 + processed_elements = kfifo_peek(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 0); + + kfifo_put(&my_fifo, 3); + kfifo_put(&my_fifo, 5); + kfifo_put(&my_fifo, 11); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + processed_elements = kfifo_peek(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, 3); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + // Using peek doesn't remove the element + // so the read element and the fifo length + // remains the same + processed_elements = kfifo_peek(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, 3); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); +} + +static struct kunit_case kfifo_test_cases[] = { + KUNIT_CASE(kfifo_test_reset_should_clear_the_fifo), + KUNIT_CASE(kfifo_test_define_should_define_an_empty_fifo), + KUNIT_CASE(kfifo_test_len_should_ret_n_of_stored_elements), + KUNIT_CASE(kfifo_test_put_should_insert_and_get_should_pop), + KUNIT_CASE(kfifo_test_in_should_insert_multiple_elements), + KUNIT_CASE(kfifo_test_out_should_pop_multiple_elements), + KUNIT_CASE(kfifo_test_dec_init_should_define_an_empty_fifo), + KUNIT_CASE(kfifo_test_define_should_equal_declare_init), + KUNIT_CASE(kfifo_test_alloc_should_initiliaze_a_ptr_fifo), + KUNIT_CASE(kfifo_test_peek_should_not_remove_elements), + {}, +}; + +static struct kunit_suite kfifo_test_module = { + .name = "kfifo", + .test_cases = kfifo_test_cases, +}; + +kunit_test_suites(&kfifo_test_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Diego Vieira "); +MODULE_DESCRIPTION("KUnit test for the kernel FIFO"); -- GitLab From 62b9ef504e7f89d6ae3e9ab704cc4befab1d37f0 Mon Sep 17 00:00:00 2001 From: Gabriela Bittencourt Date: Mon, 2 Dec 2024 15:55:42 +0800 Subject: [PATCH 210/934] unicode: kunit: refactor selftest to kunit tests Refactoring 'test' functions into kunit tests, to test utf-8 support in unicode subsystem. This allows the utf8 tests to be run alongside the KUnit test suite using kunit-tool, quickly compiling and running all desired tests as part of the KUnit test suite, instead of compiling the selftest module and loading it. The refactoring kept the original testing logic intact, while adopting a testing pattern across different kernel modules and leveraging KUnit's benefits. Co-developed-by: Pedro Orlando Signed-off-by: Pedro Orlando Co-developed-by: Danilo Pereira Signed-off-by: Danilo Pereira Signed-off-by: Gabriela Bittencourt Reviewed-by: David Gow Acked-by: Gabriel Krisman Bertazi Reviewed-by: Rae Moar Link: https://lore.kernel.org/r/20241202075545.3648096-6-davidgow@google.com Signed-off-by: Kees Cook --- fs/unicode/.kunitconfig | 3 + fs/unicode/Kconfig | 5 +- fs/unicode/Makefile | 2 +- fs/unicode/utf8-norm.c | 2 +- fs/unicode/utf8-selftest.c | 149 +++++++++++++++++-------------------- 5 files changed, 77 insertions(+), 84 deletions(-) create mode 100644 fs/unicode/.kunitconfig diff --git a/fs/unicode/.kunitconfig b/fs/unicode/.kunitconfig new file mode 100644 index 0000000000000..62dd5c171f9c8 --- /dev/null +++ b/fs/unicode/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_UNICODE=y +CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index da786a687fdc7..4ad2c36550f11 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -10,6 +10,7 @@ config UNICODE be a separate loadable module that gets requested only when a file system actually use it. -config UNICODE_NORMALIZATION_SELFTEST +config UNICODE_NORMALIZATION_KUNIT_TEST tristate "Test UTF-8 normalization support" - depends on UNICODE + depends on UNICODE && KUNIT + default KUNIT_ALL_TESTS diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index e309afe2b2bb7..37bbcbc628a13 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),) obj-y += unicode.o endif obj-$(CONFIG_UNICODE) += utf8data.o -obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += utf8-selftest.o unicode-y := utf8-norm.o utf8-core.o diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 768f8ab448b8f..7b998c99c88df 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -586,7 +586,7 @@ ccc_mismatch: } } -#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) EXPORT_SYMBOL_GPL(utf8version_is_supported); EXPORT_SYMBOL_GPL(utf8nlen); EXPORT_SYMBOL_GPL(utf8ncursor); diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 5ddaf27b21a65..9476ab012baa1 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -1,35 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel module for testing utf-8 support. + * KUnit tests for utf-8 support. * * Copyright 2017 Collabora Ltd. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include #include -#include +#include #include "utf8n.h" -static unsigned int failed_tests; -static unsigned int total_tests; - -#define _test(cond, func, line, fmt, ...) do { \ - total_tests++; \ - if (!cond) { \ - failed_tests++; \ - pr_err("test %s:%d Failed: %s%s", \ - func, line, #cond, (fmt?":":".")); \ - if (fmt) \ - pr_err(fmt, ##__VA_ARGS__); \ - } \ - } while (0) -#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define test(cond) _test(cond, __func__, __LINE__, "") - static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ unsigned char str[10]; @@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, return utf8ncursor(u8c, um, n, s, (unsigned int)-1); } -static void check_utf8_nfdi(struct unicode_map *um) +static void check_utf8_nfdi(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); int nlen = strlen(nfdi_test_data[i].dec); int j = 0; unsigned char c; + int ret; + + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len), + nlen); - test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == - nlen)); - if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdi_test_data[i].dec[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdi_test_data[i].dec[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdi_test_data[i].dec[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_nfdicf(struct unicode_map *um) +static void check_utf8_nfdicf(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); int nlen = strlen(nfdicf_test_data[i].ncf); int j = 0; + int ret; unsigned char c; - test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == - nlen)); - test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == - nlen)); + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str), + nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len), + nlen); - if (utf8cursor(&u8c, um, UTF8_NFDICF, - nfdicf_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdicf_test_data[i].ncf[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdicf_test_data[i].ncf[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdicf_test_data[i].ncf[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_comparisons(struct unicode_map *table) +static void check_utf8_comparisons(struct kunit *test) { int i; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdi_test_data[i].dec, .len = sizeof(nfdi_test_data[i].dec)}; - test_f(!utf8_strncmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { @@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, .len = sizeof(nfdicf_test_data[i].ncf)}; - test_f(!utf8_strncasecmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncasecmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } } -static void check_supported_versions(struct unicode_map *um) +static void check_supported_versions(struct kunit *test) { + struct unicode_map *um = test->priv; /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(um, UTF8_LATEST)); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. */ - test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } -static int __init init_test_ucd(void) +static struct kunit_case unicode_normalization_test_cases[] = { + KUNIT_CASE(check_supported_versions), + KUNIT_CASE(check_utf8_comparisons), + KUNIT_CASE(check_utf8_nfdicf), + KUNIT_CASE(check_utf8_nfdi), + {} +}; + +static int init_test_ucd(struct kunit *test) { - struct unicode_map *um; + struct unicode_map *um = utf8_load(UTF8_LATEST); - failed_tests = 0; - total_tests = 0; + test->priv = um; - um = utf8_load(UTF8_LATEST); - if (IS_ERR(um)) { - pr_err("%s: Unable to load utf8 table.\n", __func__); - return PTR_ERR(um); - } + KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0, + "%s: Unable to load utf8 table.\n", __func__); - check_supported_versions(um); - check_utf8_nfdi(um); - check_utf8_nfdicf(um); - check_utf8_comparisons(um); - - if (!failed_tests) - pr_info("All %u tests passed\n", total_tests); - else - pr_err("%u out of %u tests failed\n", failed_tests, - total_tests); - utf8_unload(um); return 0; } -static void __exit exit_test_ucd(void) +static void exit_test_ucd(struct kunit *test) { + utf8_unload(test->priv); } -module_init(init_test_ucd); -module_exit(exit_test_ucd); +static struct kunit_suite unicode_normalization_test_suite = { + .name = "unicode_normalization", + .test_cases = unicode_normalization_test_cases, + .init = init_test_ucd, + .exit = exit_test_ucd, +}; + +kunit_test_suite(unicode_normalization_test_suite); + MODULE_AUTHOR("Gabriel Krisman Bertazi "); -MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); +MODULE_DESCRIPTION("KUnit tests for utf-8 support."); MODULE_LICENSE("GPL"); -- GitLab From 0ff053b98a0f039e52c2bd8d0cb38f2831edfaf5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Feb 2025 13:38:59 +0100 Subject: [PATCH 211/934] fs: support O_PATH fds with FSCONFIG_SET_FD Let FSCONFIG_SET_FD handle O_PATH file descriptors. This is particularly useful in the context of overlayfs where layers can be specified via file descriptors instead of paths. But userspace must currently use non-O_PATH file desriptors which is often pointless especially if the file descriptors have been created via open_tree(OPEN_TREE_CLONE). Link: https://lore.kernel.org/r/20250210-work-overlayfs-v2-1-ed2a949b674b@kernel.org Fixes: a08557d19ef41 ("ovl: specify layers via file descriptors") Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/autofs/autofs_i.h | 2 ++ fs/fsopen.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 77c7991d89aac..23cea74f9933b 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *); static inline int autofs_check_pipe(struct file *pipe) { + if (pipe->f_mode & FMODE_PATH) + return -EINVAL; if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) diff --git a/fs/fsopen.c b/fs/fsopen.c index 094a7f510edfe..1aaf4cb2afb29 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; - param.file = fget(aux); + param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; -- GitLab From 85c8700cb6e67cac228bfb313fa8e33d1750410f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Feb 2025 13:39:00 +0100 Subject: [PATCH 212/934] selftests/overlayfs: test specifying layers as O_PATH file descriptors Verify that userspace can specify layers via O_PATH file descriptors. Link: https://lore.kernel.org/r/20250210-work-overlayfs-v2-2-ed2a949b674b@kernel.org Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- .../overlayfs/set_layers_via_fds.c | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c index 1d0ae785a6677..e693e4102d22d 100644 --- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c +++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c @@ -214,4 +214,69 @@ TEST_F(set_layers_via_fds, set_500_layers_via_fds) ASSERT_EQ(close(fd_overlay), 0); } +TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds) +{ + int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower; + int layer_fds[500] = { [0 ... 499] = -EBADF }; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + char path[100]; + + sprintf(path, "l%d", i); + ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0); + layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[i], 0); + } + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + fd_work = openat(fd_tmpfs, "w", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_work, 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_upper, 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0); + fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_lower, 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, fd_work), 0); + ASSERT_EQ(close(fd_work), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper), 0); + ASSERT_EQ(close(fd_upper), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0); + ASSERT_EQ(close(layer_fds[i]), 0); + } + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0); + ASSERT_EQ(close(fd_lower), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + TEST_HARNESS_MAIN -- GitLab From db04662e2f4fced10b93032c65ff03caaae21053 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Jan 2025 20:19:48 +0100 Subject: [PATCH 213/934] fs: allow detached mounts in clone_private_mount() In container workloads idmapped mounts are often used as layers for overlayfs. Recently I added the ability to specify layers in overlayfs as file descriptors instead of path names. It should be possible to simply use the detached mounts directly when specifying layers instead of having to attach them beforehand. They are discarded after overlayfs is mounted anyway so it's pointless system calls for userspace and pointless locking for the kernel. This just recently come up again in [1]. So enable clone_private_mount() to use detached mounts directly. Following conditions must be met: - Provided path must be the root of a detached mount tree. - Provided path may not create mount namespace loops. - Provided path must be mounted. It would be possible to be stricter and require that the caller must have CAP_SYS_ADMIN in the owning user namespace of the anonymous mount namespace but since this restriction isn't enforced for move_mount() there's no point in enforcing it for clone_private_mount(). This contains a folded fix for: Reported-by: syzbot+62dfea789a2cedac1298@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=62dfea789a2cedac1298 provided by Lizhi Xu in [2]. Link: https://lore.kernel.org/r/20250207071331.550952-1-lizhi.xu@windriver.com [2] Link: https://lore.kernel.org/r/fd8f6574-f737-4743-b220-79c815ee1554@mbaynton.com [1] Link: https://lore.kernel.org/r/20250123-avancieren-erfreuen-3d61f6588fdd@brauner Tested-by: Mike Baynton Signed-off-by: Christian Brauner --- fs/namespace.c | 78 ++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 05f6680e2251f..d470a369d42bc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2369,6 +2369,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +/* + * Check that there aren't references to earlier/same mount namespaces in the + * specified subtree. Such references can act as pins for mount namespaces + * that aren't checked by the mount-cycle checking code, thereby allowing + * cycles to be made. + */ +static bool check_for_nsfs_mounts(struct mount *subtree) +{ + struct mount *p; + bool ret = false; + + lock_mount_hash(); + for (p = subtree; p; p = next_mnt(p, subtree)) + if (mnt_ns_loop(p->mnt.mnt_root)) + goto out; + + ret = true; +out: + unlock_mount_hash(); + return ret; +} + /** * clone_private_mount - create a private clone of a path * @path: path to clone @@ -2377,6 +2399,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * + * This assumes caller has called or done the equivalent of may_mount(). + * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) @@ -2384,30 +2408,36 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - down_read(&namespace_sem); + scoped_guard(rwsem_read, &namespace_sem) if (IS_MNT_UNBINDABLE(old_mnt)) - goto invalid; + return ERR_PTR(-EINVAL); + + if (mnt_has_parent(old_mnt)) { + if (!check_mnt(old_mnt)) + return ERR_PTR(-EINVAL); + } else { + if (!is_mounted(&old_mnt->mnt)) + return ERR_PTR(-EINVAL); - if (!check_mnt(old_mnt)) - goto invalid; + /* Make sure this isn't something purely kernel internal. */ + if (!is_anon_ns(old_mnt->mnt_ns)) + return ERR_PTR(-EINVAL); + + /* Make sure we don't create mount namespace loops. */ + if (!check_for_nsfs_mounts(old_mnt)) + return ERR_PTR(-EINVAL); + } if (has_locked_children(old_mnt, path->dentry)) - goto invalid; + return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); - if (IS_ERR(new_mnt)) - return ERR_CAST(new_mnt); + return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; - return &new_mnt->mnt; - -invalid: - up_read(&namespace_sem); - return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -3206,28 +3236,6 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -/* - * Check that there aren't references to earlier/same mount namespaces in the - * specified subtree. Such references can act as pins for mount namespaces - * that aren't checked by the mount-cycle checking code, thereby allowing - * cycles to be made. - */ -static bool check_for_nsfs_mounts(struct mount *subtree) -{ - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) - if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; -} - static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; -- GitLab From 784ed4354c9077501b3a9236bafea23e5b878703 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 12:27:46 +0100 Subject: [PATCH 214/934] uidgid: add map_id_range_up() Add map_id_range_up() to verify that the full kernel id range can be mapped up in a given idmapping. This will be used in follow-up patches. Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-1-007720f39f2e@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/uidgid.h | 6 ++++++ kernel/user_namespace.c | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index f85ec5613721f..2dc767e08f544 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -132,6 +132,7 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else @@ -186,6 +187,11 @@ static inline u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } +static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) +{ + return id; +} + static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa0b2e47f2f21..682f40d5632d4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns); struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ - u32 count; /* == 0 unless used with map_id_range_down() */ + u32 count; }; /* @@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id) * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; - u32 first, last; + u32 first, last, id2; + + id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; @@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; - key.count = 1; + key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } -u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_up_base(extents, map, id); + extent = map_id_range_up_base(extents, map, id, count); else - extent = map_id_up_max(extents, map, id); + extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) @@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return map_id_range_up(map, id, 1); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in -- GitLab From 37c4a9590e1efcae7749682239fc22a330d2d325 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 12:27:47 +0100 Subject: [PATCH 215/934] statmount: allow to retrieve idmappings This adds the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP options. It allows the retrieval of idmappings via statmount(). Currently it isn't possible to figure out what idmappings are applied to an idmapped mount. This information is often crucial. Before statmount() the only realistic options for an interface like this would have been to add it to /proc//fdinfo/ or to expose it in /proc//mountinfo. Both solution would have been pretty ugly and would've shown information that is of strong interest to some application but not all. statmount() is perfect for this. The idmappings applied to an idmapped mount are shown relative to the caller's user namespace. This is the most useful solution that doesn't risk leaking information or confuse the caller. For example, an idmapped mount might have been created with the following idmappings: mount --bind -o X-mount.idmap="0:10000:1000 2000:2000:1 3000:3000:1" /srv /opt Listing the idmappings through statmount() in the same context shows: mnt_id: 2147485088 mnt_parent_id: 2147484816 fs_type: btrfs mnt_root: /srv mnt_point: /opt mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/ mnt_uidmap[0]: 0 10000 1000 mnt_uidmap[1]: 2000 2000 1 mnt_uidmap[2]: 3000 3000 1 mnt_gidmap[0]: 0 10000 1000 mnt_gidmap[1]: 2000 2000 1 mnt_gidmap[2]: 3000 3000 1 But the idmappings might not always be resolvable in the caller's user namespace. For example: unshare --user --map-root In this case statmount() will skip any mappings that fil to resolve in the caller's idmapping: mnt_id: 2147485087 mnt_parent_id: 2147484016 fs_type: btrfs mnt_root: /srv mnt_point: /opt mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/ The caller can differentiate between a mount not being idmapped and a mount that is idmapped but where all mappings fail to resolve in the caller's idmapping by check for the STATMOUNT_MNT_{G,U}IDMAP flag being raised but the number of mappings in ->mnt_{g,u}idmap_num being zero. Note that statmount() requires that the whole range must be resolvable in the caller's user namespace. If a subrange fails to map it will still list the map as not resolvable. This is a practical compromise to avoid having to find which subranges are resovable and wich aren't. Idmappings are listed as a string array with each mapping separated by zero bytes. This allows to retrieve the idmappings and immediately use them for writing to e.g., /proc//{g,u}id_map and it also allow for simple iteration like: if (stmnt->mask & STATMOUNT_MNT_UIDMAP) { const char *idmap = stmnt->str + stmnt->mnt_uidmap; for (size_t idx = 0; idx < stmnt->mnt_uidmap_nr; idx++) { printf("mnt_uidmap[%lu]: %s\n", idx, idmap); idmap += strlen(idmap) + 1; } } Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-2-007720f39f2e@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/internal.h | 1 + fs/mnt_idmapping.c | 51 ++++++++++++++++++++++++++++++ fs/namespace.c | 59 ++++++++++++++++++++++++++++++++++- include/linux/mnt_idmapping.h | 5 +++ include/uapi/linux/mount.h | 8 ++++- 5 files changed, 122 insertions(+), 2 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e0981..db6094d5cb0b6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -338,3 +338,4 @@ static inline bool path_mounted(const struct path *path) return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file); +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 7b1df8cc2821d..a37991fdb194d 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "internal.h" @@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); + +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) +{ + struct uid_gid_map *map, *map_up; + u32 idx, nr_mappings; + + if (!is_valid_mnt_idmap(idmap)) + return 0; + + /* + * Idmappings are shown relative to the caller's idmapping. + * This is both the most intuitive and most useful solution. + */ + if (uid_map) { + map = &idmap->uid_map; + map_up = ¤t_user_ns()->uid_map; + } else { + map = &idmap->gid_map; + map_up = ¤t_user_ns()->gid_map; + } + + for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { + uid_t lower; + struct uid_gid_extent *extent; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = &map->extent[idx]; + else + extent = &map->forward[idx]; + + /* + * Verify that the whole range of the mapping can be + * resolved in the caller's idmapping. If it cannot be + * resolved skip the mapping. + */ + lower = map_id_range_up(map_up, extent->lower_first, extent->count); + if (lower == (uid_t) -1) + continue; + + seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); + + seq->count++; /* mappings are separated by \0 */ + if (seq_has_overflowed(seq)) + return -EAGAIN; + + nr_mappings++; + } + + return nr_mappings; +} diff --git a/fs/namespace.c b/fs/namespace.c index d470a369d42bc..967ee2a5bacf3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5008,6 +5008,7 @@ struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; + struct mnt_idmap *idmap; u64 mask; struct path root; struct statmount sm; @@ -5278,6 +5279,46 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, true); + if (ret < 0) + return ret; + + s->sm.mnt_uidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. + */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_UIDMAP; + return 0; +} + +static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, false); + if (ret < 0) + return ret; + + s->sm.mnt_gidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. + */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_GIDMAP; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; @@ -5319,6 +5360,14 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->sb_source = start; ret = statmount_sb_source(s, seq); break; + case STATMOUNT_MNT_UIDMAP: + sm->mnt_uidmap = start; + ret = statmount_mnt_uidmap(s, seq); + break; + case STATMOUNT_MNT_GIDMAP: + sm->mnt_gidmap = start; + ret = statmount_mnt_gidmap(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5443,6 +5492,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, return err; s->root = root; + s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); @@ -5476,6 +5526,12 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_UIDMAP) + err = statmount_string(s, STATMOUNT_MNT_UIDMAP); + + if (!err && s->mask & STATMOUNT_MNT_GIDMAP) + err = statmount_string(s, STATMOUNT_MNT_GIDMAP); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5499,7 +5555,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ - STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b1b219bc34224..e71a6070a8f8c 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -25,6 +25,11 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); +static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap) +{ + return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap; +} + #ifdef CONFIG_MULTIUSER static inline uid_t __vfsuid_val(vfsuid_t uid) { diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index c07008816acae..0be6ac4c16243 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -179,7 +179,11 @@ struct statmount { __u32 opt_array; /* [str] Array of nul terminated fs options */ __u32 opt_sec_num; /* Number of security options */ __u32 opt_sec_array; /* [str] Array of nul terminated security options */ - __u64 __spare2[46]; + __u32 mnt_uidmap_num; /* Number of uid mappings */ + __u32 mnt_uidmap; /* [str] Array of uid mappings (as seen from callers namespace) */ + __u32 mnt_gidmap_num; /* Number of gid mappings */ + __u32 mnt_gidmap; /* [str] Array of gid mappings (as seen from callers namespace) */ + __u64 __spare2[44]; char str[]; /* Variable size part containing strings */ }; @@ -217,6 +221,8 @@ struct mnt_id_req { #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ #define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ #define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ +#define STATMOUNT_MNT_UIDMAP 0x00001000U /* Want/got uidmap... */ +#define STATMOUNT_MNT_GIDMAP 0x00002000U /* Want/got gidmap... */ /* * Special @mnt_id values that can be passed to listmount -- GitLab From a496dfecbc47253012f1e10d01110428bb9dc118 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 12:27:48 +0100 Subject: [PATCH 216/934] samples/vfs: check whether flag was raised For string options the kernel will raise the corresponding flag only if the string isn't empty. So check for the flag. Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-3-007720f39f2e@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- samples/vfs/test-list-all-mounts.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c index 1a02ea4593e39..ce272ded8a79a 100644 --- a/samples/vfs/test-list-all-mounts.c +++ b/samples/vfs/test-list-all-mounts.c @@ -138,10 +138,11 @@ next: printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", (uint64_t)stmnt->mnt_id, (uint64_t)stmnt->mnt_parent_id, - stmnt->str + stmnt->fs_type, - stmnt->str + stmnt->mnt_root, - stmnt->str + stmnt->mnt_point, - stmnt->str + stmnt->mnt_opts); + (stmnt->mask & STATMOUNT_FS_TYPE) ? stmnt->str + stmnt->fs_type : "", + (stmnt->mask & STATMOUNT_MNT_ROOT) ? stmnt->str + stmnt->mnt_root : "", + (stmnt->mask & STATMOUNT_MNT_POINT) ? stmnt->str + stmnt->mnt_point : "", + (stmnt->mask & STATMOUNT_MNT_OPTS) ? stmnt->str + stmnt->mnt_opts : ""); + free(stmnt); } } -- GitLab From ccc829b15d48a450b186cab097f4f5d01df445d4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Jan 2025 20:19:49 +0100 Subject: [PATCH 217/934] selftests: add tests for using detached mount with overlayfs Test that it is possible to use detached mounts as overlayfs layers. Link: https://lore.kernel.org/r/20250123-erstbesteigung-angeeignet-1d30e64b7df2@brauner Signed-off-by: Christian Brauner --- .../overlayfs/set_layers_via_fds.c | 130 ++++++++++++++++++ .../filesystems/overlayfs/wrappers.h | 17 +++ 2 files changed, 147 insertions(+) diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c index e693e4102d22d..e65d95d97846d 100644 --- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c +++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c @@ -20,12 +20,16 @@ FIXTURE(set_layers_via_fds) { FIXTURE_SETUP(set_layers_via_fds) { ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0); + ASSERT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0); } FIXTURE_TEARDOWN(set_layers_via_fds) { umount2("/set_layers_via_fds", 0); ASSERT_EQ(rmdir("/set_layers_via_fds"), 0); + + umount2("/set_layers_via_fds_tmpfs", 0); + ASSERT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0); } TEST_F(set_layers_via_fds, set_layers_via_fds) @@ -279,4 +283,130 @@ TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds) ASSERT_EQ(close(fd_overlay), 0); } +TEST_F(set_layers_via_fds, set_layers_via_detached_mount_fds) +{ + int fd_context, fd_tmpfs, fd_overlay, fd_tmp; + int layer_fds[] = { [0 ... 8] = -EBADF }; + bool layers_found[] = { [0 ... 8] = false }; + size_t len = 0; + char *line = NULL; + FILE *f_mountinfo; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u/upper", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u/work", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/set_layers_via_fds_tmpfs", MOVE_MOUNT_F_EMPTY_PATH), 0); + + fd_tmp = open_tree(fd_tmpfs, "u", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tmp, 0); + + layer_fds[0] = openat(fd_tmp, "upper", O_CLOEXEC | O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmp, "work", O_CLOEXEC | O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = open_tree(fd_tmpfs, "l1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = open_tree(fd_tmpfs, "l2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[3], 0); + + layer_fds[4] = open_tree(fd_tmpfs, "l3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[4], 0); + + layer_fds[5] = open_tree(fd_tmpfs, "l4", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[5], 0); + + layer_fds[6] = open_tree(fd_tmpfs, "d1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[6], 0); + + layer_fds[7] = open_tree(fd_tmpfs, "d2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[7], 0); + + layer_fds[8] = open_tree(fd_tmpfs, "d3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[8], 0); + + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + + f_mountinfo = fopen("/proc/self/mountinfo", "r"); + ASSERT_NE(f_mountinfo, NULL); + + while (getline(&line, &len, f_mountinfo) != -1) { + char *haystack = line; + + if (strstr(haystack, "workdir=/tmp/w")) + layers_found[0] = true; + if (strstr(haystack, "upperdir=/tmp/u")) + layers_found[1] = true; + if (strstr(haystack, "lowerdir+=/tmp/l1")) + layers_found[2] = true; + if (strstr(haystack, "lowerdir+=/tmp/l2")) + layers_found[3] = true; + if (strstr(haystack, "lowerdir+=/tmp/l3")) + layers_found[4] = true; + if (strstr(haystack, "lowerdir+=/tmp/l4")) + layers_found[5] = true; + if (strstr(haystack, "datadir+=/tmp/d1")) + layers_found[6] = true; + if (strstr(haystack, "datadir+=/tmp/d2")) + layers_found[7] = true; + if (strstr(haystack, "datadir+=/tmp/d3")) + layers_found[8] = true; + } + free(line); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + ASSERT_EQ(layers_found[i], true); + ASSERT_EQ(close(layer_fds[i]), 0); + } + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); + ASSERT_EQ(fclose(f_mountinfo), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/overlayfs/wrappers.h index 071b95fd2ac0a..c38bc48e0cfaf 100644 --- a/tools/testing/selftests/filesystems/overlayfs/wrappers.h +++ b/tools/testing/selftests/filesystems/overlayfs/wrappers.h @@ -44,4 +44,21 @@ static inline int sys_move_mount(int from_dfd, const char *from_pathname, to_pathname, flags); } +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + #endif -- GitLab From fa204a65f1b65664eb938940c73cdfc0dcfd578e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 12:27:49 +0100 Subject: [PATCH 218/934] samples/vfs: add STATMOUNT_MNT_{G,U}IDMAP Illustrate how to use STATMOUNT_MNT_{G,U}IDMAP. Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-4-007720f39f2e@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- samples/vfs/samples-vfs.h | 14 +++++++++++++- samples/vfs/test-list-all-mounts.c | 26 ++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/samples/vfs/samples-vfs.h b/samples/vfs/samples-vfs.h index 103e1e7c4cec2..498baf581b563 100644 --- a/samples/vfs/samples-vfs.h +++ b/samples/vfs/samples-vfs.h @@ -42,7 +42,11 @@ struct statmount { __u32 opt_array; /* [str] Array of nul terminated fs options */ __u32 opt_sec_num; /* Number of security options */ __u32 opt_sec_array; /* [str] Array of nul terminated security options */ - __u64 __spare2[46]; + __u32 mnt_uidmap_num; /* Number of uid mappings */ + __u32 mnt_uidmap; /* [str] Array of uid mappings */ + __u32 mnt_gidmap_num; /* Number of gid mappings */ + __u32 mnt_gidmap; /* [str] Array of gid mappings */ + __u64 __spare2[44]; char str[]; /* Variable size part containing strings */ }; @@ -158,6 +162,14 @@ struct mnt_ns_info { #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #endif +#ifndef STATMOUNT_MNT_UIDMAP +#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */ +#endif + +#ifndef STATMOUNT_MNT_GIDMAP +#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... */ +#endif + #ifndef MOUNT_ATTR_RDONLY #define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ #endif diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c index ce272ded8a79a..bb3b83d8f1d7c 100644 --- a/samples/vfs/test-list-all-mounts.c +++ b/samples/vfs/test-list-all-mounts.c @@ -128,14 +128,16 @@ next: STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID | STATMOUNT_MNT_OPTS | - STATMOUNT_FS_TYPE, 0); + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_UIDMAP | + STATMOUNT_MNT_GIDMAP, 0); if (!stmnt) { printf("Failed to statmount(%" PRIu64 ") in mount namespace(%" PRIu64 ")\n", (uint64_t)last_mnt_id, (uint64_t)info.mnt_ns_id); continue; } - printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", + printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n", (uint64_t)stmnt->mnt_id, (uint64_t)stmnt->mnt_parent_id, (stmnt->mask & STATMOUNT_FS_TYPE) ? stmnt->str + stmnt->fs_type : "", @@ -143,6 +145,26 @@ next: (stmnt->mask & STATMOUNT_MNT_POINT) ? stmnt->str + stmnt->mnt_point : "", (stmnt->mask & STATMOUNT_MNT_OPTS) ? stmnt->str + stmnt->mnt_opts : ""); + if (stmnt->mask & STATMOUNT_MNT_UIDMAP) { + const char *idmap = stmnt->str + stmnt->mnt_uidmap; + + for (size_t idx = 0; idx < stmnt->mnt_uidmap_num; idx++) { + printf("mnt_uidmap[%lu]:\t%s\n", idx, idmap); + idmap += strlen(idmap) + 1; + } + } + + if (stmnt->mask & STATMOUNT_MNT_GIDMAP) { + const char *idmap = stmnt->str + stmnt->mnt_gidmap; + + for (size_t idx = 0; idx < stmnt->mnt_gidmap_num; idx++) { + printf("mnt_gidmap[%lu]:\t%s\n", idx, idmap); + idmap += strlen(idmap) + 1; + } + } + + printf("\n"); + free(stmnt); } } -- GitLab From 8f6116b5b77b0536d2ad7482ee42bfe58b8fac01 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 6 Feb 2025 07:51:52 -0500 Subject: [PATCH 219/934] statmount: add a new supported_mask field Some of the fields in the statmount() reply can be optional. If the kernel has nothing to emit in that field, then it doesn't set the flag in the reply. This presents a problem: There is currently no way to know what mask flags the kernel supports since you can't always count on them being in the reply. Add a new STATMOUNT_SUPPORTED_MASK flag and field that the kernel can set in the reply. Userland can use this to determine if the fields it requires from the kernel are supported. This also gives us a way to deprecate fields in the future, if that should become necessary. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20250206-statmount-v2-1-6ae70a21c2ab@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 23 +++++++++++++++++++++++ include/uapi/linux/mount.h | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index d470a369d42bc..c5ada826d4889 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5410,6 +5410,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) return 0; } +/* This must be updated whenever a new flag is added */ +#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ + STATMOUNT_MNT_BASIC | \ + STATMOUNT_PROPAGATE_FROM | \ + STATMOUNT_MNT_ROOT | \ + STATMOUNT_MNT_POINT | \ + STATMOUNT_FS_TYPE | \ + STATMOUNT_MNT_NS_ID | \ + STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | \ + STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | \ + STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_SUPPORTED_MASK) + static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5479,9 +5494,17 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); + if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { + s->sm.mask |= STATMOUNT_SUPPORTED_MASK; + s->sm.supported_mask = STATMOUNT_SUPPORTED; + } + if (err) return err; + /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ + WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); + return 0; } diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index c07008816acae..c553dc4ba6840 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -179,7 +179,8 @@ struct statmount { __u32 opt_array; /* [str] Array of nul terminated fs options */ __u32 opt_sec_num; /* Number of security options */ __u32 opt_sec_array; /* [str] Array of nul terminated security options */ - __u64 __spare2[46]; + __u64 supported_mask; /* Mask flags that this kernel supports */ + __u64 __spare2[45]; char str[]; /* Variable size part containing strings */ }; @@ -217,6 +218,7 @@ struct mnt_id_req { #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ #define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ #define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ +#define STATMOUNT_SUPPORTED_MASK 0x00001000U /* Want/got supported mask flags */ /* * Special @mnt_id values that can be passed to listmount -- GitLab From 901766df440f33b5aa30a491dc3e78655a627041 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Jan 2025 11:33:39 +0100 Subject: [PATCH 220/934] fs: add vfs_open_tree() helper Split out vfs_open_tree() from open_tree() so we can use it in later patches. Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-1-c25feb0d2eb3@kernel.org Reviewed-by: "Seth Forshee (DigitalOcean)" Signed-off-by: Christian Brauner --- fs/namespace.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index c5ada826d4889..7604cdacba20b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3002,24 +3002,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } -SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { - struct file *file; - struct path path; + int ret; + struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; - int error; - int fd; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) - return -EINVAL; + return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) - return -EINVAL; + return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; @@ -3029,27 +3027,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) - return -EPERM; + return ERR_PTR(-EPERM); + + ret = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(ret)) + return ERR_PTR(ret); + + if (detached) + return open_detached_copy(&path, flags & AT_RECURSIVE); + + return dentry_open(&path, O_PATH, current_cred()); +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +{ + int fd; + struct file *file __free(fput) = NULL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; - error = user_path_at(dfd, filename, lookup_flags, &path); - if (unlikely(error)) { - file = ERR_PTR(error); - } else { - if (detached) - file = open_detached_copy(&path, flags & AT_RECURSIVE); - else - file = dentry_open(&path, O_PATH, current_cred()); - path_put(&path); - } - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - fd_install(fd, file); + fd_install(fd, no_free_ptr(file)); return fd; } -- GitLab From 474f7825d5335798742b92f067e1d22365013107 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Jan 2025 11:33:40 +0100 Subject: [PATCH 221/934] fs: add copy_mount_setattr() helper Split out copy_mount_setattr() from mount_setattr() so we can use it in later patches. Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-2-c25feb0d2eb3@kernel.org Reviewed-by: "Seth Forshee (DigitalOcean)" Signed-off-by: Christian Brauner --- fs/namespace.c | 73 +++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7604cdacba20b..d2ef1d69839be 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4814,7 +4814,7 @@ out: } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; @@ -4865,22 +4865,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { - unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - - *kattr = (struct mount_kattr) { - .lookup_flags = lookup_flags, - .recurse = !!(flags & AT_RECURSIVE), - }; - if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) @@ -4928,7 +4914,7 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize, return -EINVAL; } - return build_mount_idmapped(attr, usize, kattr, flags); + return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) @@ -4940,23 +4926,14 @@ static void finish_mount_kattr(struct mount_kattr *kattr) mnt_idmap_put(kattr->mnt_idmap); } -SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, - unsigned int, flags, struct mount_attr __user *, uattr, - size_t, usize) +static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize, + struct mount_kattr *kattr) { - int err; - struct path target; + int ret; struct mount_attr attr; - struct mount_kattr kattr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); - if (flags & ~(AT_EMPTY_PATH | - AT_RECURSIVE | - AT_SYMLINK_NOFOLLOW | - AT_NO_AUTOMOUNT)) - return -EINVAL; - if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) @@ -4965,9 +4942,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, if (!may_mount()) return -EPERM; - err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); - if (err) - return err; + ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (ret) + return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && @@ -4975,7 +4952,37 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, attr.propagation == 0) return 0; - err = build_mount_kattr(&attr, usize, &kattr, flags); + return build_mount_kattr(&attr, usize, kattr); +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_kattr kattr; + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + .recurse = !!(flags & AT_RECURSIVE), + }; + + err = copy_mount_setattr(uattr, usize, &kattr); if (err) return err; -- GitLab From c4a16820d90199409c9bf01c4f794e1e9e8d8fd8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Jan 2025 11:33:41 +0100 Subject: [PATCH 222/934] fs: add open_tree_attr() Add open_tree_attr() which allow to atomically create a detached mount tree and set mount options on it. If OPEN_TREE_CLONE is used this will allow the creation of a detached mount with a new set of mount options without it ever being exposed to userspace without that set of mount options applied. Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-3-c25feb0d2eb3@kernel.org Reviewed-by: "Seth Forshee (DigitalOcean)" Signed-off-by: Christian Brauner --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/tools/syscall_32.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + fs/namespace.c | 39 +++++++++++++++++++++ include/linux/syscalls.h | 4 +++ include/uapi/asm-generic/unistd.h | 4 ++- scripts/syscall.tbl | 1 + 20 files changed, 63 insertions(+), 1 deletion(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index c59d53d6d3f34..2dd6340de6b4e 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -506,3 +506,4 @@ 574 common getxattrat sys_getxattrat 575 common listxattrat sys_listxattrat 576 common removexattrat sys_removexattrat +577 common open_tree_attr sys_open_tree_attr diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 49eeb2ad8dbd8..27c1d5ebcd91c 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -481,3 +481,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl index 69a829912a05e..0765b3a8d6d60 100644 --- a/arch/arm64/tools/syscall_32.tbl +++ b/arch/arm64/tools/syscall_32.tbl @@ -478,3 +478,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f5ed71f1910d0..9fe47112c586f 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -466,3 +466,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 680f568b77f2c..7b6e97828e552 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -472,3 +472,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 0b9b7e25b69ad..aa70e371bb54a 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -405,3 +405,4 @@ 464 n32 getxattrat sys_getxattrat 465 n32 listxattrat sys_listxattrat 466 n32 removexattrat sys_removexattrat +467 n32 open_tree_attr sys_open_tree_attr diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c844cd5cda620..1e8c44c7b6149 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -381,3 +381,4 @@ 464 n64 getxattrat sys_getxattrat 465 n64 listxattrat sys_listxattrat 466 n64 removexattrat sys_removexattrat +467 n64 open_tree_attr sys_open_tree_attr diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 349b8aad1159f..114a5a1a62302 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -454,3 +454,4 @@ 464 o32 getxattrat sys_getxattrat 465 o32 listxattrat sys_listxattrat 466 o32 removexattrat sys_removexattrat +467 o32 open_tree_attr sys_open_tree_attr diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index d9fc94c869657..94df3cb957e9d 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -465,3 +465,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index d8b4ab78bef07..9a084bdb89269 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -557,3 +557,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index e9115b4d8b635..a4569b96ef06c 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -469,3 +469,4 @@ 464 common getxattrat sys_getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c8cad33bf250e..52a7652fcff63 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -470,3 +470,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 727f99d333b30..83e45eb6c095a 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -512,3 +512,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4d0fb2fba7e20..3f0ec87d5db4e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -472,3 +472,4 @@ 464 i386 getxattrat sys_getxattrat 465 i386 listxattrat sys_listxattrat 466 i386 removexattrat sys_removexattrat +467 i386 open_tree_attr sys_open_tree_attr diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5eb708bff1c79..cfb5ca41e30de 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -390,6 +390,7 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 37effc1b134ee..f657a77314f86 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/fs/namespace.c b/fs/namespace.c index d2ef1d69839be..ac4ad746c7705 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4995,6 +4995,45 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, + unsigned, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + struct file __free(fput) *file = NULL; + int fd; + + if (!uattr && usize) + return -EINVAL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (uattr) { + int ret; + struct mount_kattr kattr = { + .recurse = !!(flags & AT_RECURSIVE), + }; + + ret = copy_mount_setattr(uattr, usize, &kattr); + if (ret) + return ret; + + ret = do_mount_setattr(&file->f_path, &kattr); + if (ret) + return ret; + + finish_mount_kattr(&kattr); + } + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + fd_install(fd, no_free_ptr(file)); + return fd; +} + int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d4513..079ea1d09d85e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -951,6 +951,10 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_open_tree_attr(int dfd, const char __user *path, + unsigned flags, + struct mount_attr __user *uattr, + size_t usize); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 88dc393c2bca3..2892a45023af6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -849,9 +849,11 @@ __SYSCALL(__NR_getxattrat, sys_getxattrat) __SYSCALL(__NR_listxattrat, sys_listxattrat) #define __NR_removexattrat 466 __SYSCALL(__NR_removexattrat, sys_removexattrat) +#define __NR_open_tree_attr 467 +__SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) #undef __NR_syscalls -#define __NR_syscalls 467 +#define __NR_syscalls 468 /* * 32 bit systems traditionally used different diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index ebbdb3c42e9f7..580b4e246aecd 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -407,3 +407,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr -- GitLab From 325cca846fe4ed20fa68c076e25878ea9d350515 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Jan 2025 11:33:42 +0100 Subject: [PATCH 223/934] fs: add kflags member to struct mount_kattr Instead of using a boolean use a flag so we can add new flags in following patches. Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-4-c25feb0d2eb3@kernel.org Reviewed-by: "Seth Forshee (DigitalOcean)" Signed-off-by: Christian Brauner --- fs/namespace.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ac4ad746c7705..a6d3f2041fda8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -87,12 +87,16 @@ LIST_HEAD(notify_list); /* protected by namespace_sem */ static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ +enum mount_kattr_flags_t { + MOUNT_KATTR_RECURSE = (1 << 0), +}; + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; - bool recurse; + enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; @@ -4672,7 +4676,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) break; } - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } @@ -4733,7 +4737,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); @@ -4763,7 +4767,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) */ namespace_lock(); if (kattr->propagation == MS_SHARED) { - err = invent_group_ids(mnt, kattr->recurse); + err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; @@ -4979,9 +4983,11 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, kattr = (struct mount_kattr) { .lookup_flags = lookup_flags, - .recurse = !!(flags & AT_RECURSIVE), }; + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + err = copy_mount_setattr(uattr, usize, &kattr); if (err) return err; @@ -5011,9 +5017,10 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, if (uattr) { int ret; - struct mount_kattr kattr = { - .recurse = !!(flags & AT_RECURSIVE), - }; + struct mount_kattr kattr = {}; + + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; ret = copy_mount_setattr(uattr, usize, &kattr); if (ret) -- GitLab From 2462651ffa76b87f9c2e4403ef6e6b89b703fb2f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Jan 2025 11:33:43 +0100 Subject: [PATCH 224/934] fs: allow changing idmappings This patchset makes it possible to create a new idmapped mount from an already idmapped mount and to clear idmappings. // Create a first idmapped mount struct mount_attr attr = { .attr_set = MOUNT_ATTR_IDMAP .userns_fd = fd_userns }; fd_tree = open_tree(-EBADF, "/", OPEN_TREE_CLONE, &attr, sizeof(attr)); move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); // Create a second idmapped mount from the first idmapped mount attr.attr_set = MOUNT_ATTR_IDMAP; attr.userns_fd = fd_userns2; fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr)); // Create a second non-idmapped mount from the first idmapped mount: memset(&attr, 0, sizeof(attr)); attr.attr_clr = MOUNT_ATTR_IDMAP; fd_tree2 = open_tree(-EBADF, "/mnt", OPEN_TREE_CLONE, &attr, sizeof(attr)); Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-5-c25feb0d2eb3@kernel.org Reviewed-by: "Seth Forshee (DigitalOcean)" Signed-off-by: Christian Brauner --- fs/namespace.c | 53 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a6d3f2041fda8..1d8d14ca6c13d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -89,6 +89,7 @@ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), + MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), }; struct mount_kattr { @@ -4612,11 +4613,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return -EINVAL; /* - * Once a mount has been idmapped we don't allow it to change its - * mapping. It makes things simpler and callers can just create - * another bind-mount they can idmap if they want to. + * We only allow an mount to change it's idmapping if it has + * never been accessible to userspace. */ - if (is_idmapped_mnt(m)) + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ @@ -4706,18 +4706,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { + struct mnt_idmap *old_idmap; + if (!kattr->mnt_idmap) return; - /* - * Pairs with smp_load_acquire() in mnt_idmap(). - * - * Since we only allow a mount to change the idmapping once and - * verified this in can_idmap_mount() we know that the mount has - * @nop_mnt_idmap attached to it. So there's no need to drop any - * references. - */ + old_idmap = mnt_idmap(&mnt->mnt); + + /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); + mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4826,13 +4824,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; - /* - * We currently do not support clearing an idmapped mount. If this ever - * is a use-case we can revisit this but for now let's keep it simple - * and not allow it. - */ - if (attr->attr_clr & MOUNT_ATTR_IDMAP) - return -EINVAL; + if (attr->attr_clr & MOUNT_ATTR_IDMAP) { + /* + * We can only remove an idmapping if it's never been + * exposed to userspace. + */ + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) + return -EINVAL; + + /* + * Removal of idmappings is equivalent to setting + * nop_mnt_idmap. + */ + if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { + kattr->mnt_idmap = &nop_mnt_idmap; + return 0; + } + } if (attr->userns_fd > INT_MAX) return -EINVAL; @@ -4923,8 +4931,10 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize, static void finish_mount_kattr(struct mount_kattr *kattr) { - put_user_ns(kattr->mnt_userns); - kattr->mnt_userns = NULL; + if (kattr->mnt_userns) { + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; + } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); @@ -5019,6 +5029,7 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, int ret; struct mount_kattr kattr = {}; + kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; -- GitLab From ee5eda8ea59546af2e8f192c060fbf29862d7cbd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 10 Feb 2025 12:40:39 +0100 Subject: [PATCH 225/934] pipe: change pipe_write() to never add a zero-sized buffer a194dfe6e6f6 ("pipe: Rearrange sequence in pipe_write() to preallocate slot") changed pipe_write() to increment pipe->head in advance. IIUC to avoid the race with the post_one_notification()-like code which can add another buffer under pipe->rd_wait.lock without pipe->mutex. This is no longer necessary after c73be61cede5 ("pipe: Add general notification queue support"), pipe_write() checks pipe_has_watch_queue() and returns -EXDEV at the start. And can't help in any case, pipe_write() no longer takes this rd_wait.lock spinlock. Change pipe_write() to call copy_page_from_iter() first and do nothing if it fails. This way pipe_write() can't add a zero-sized buffer and we can simplify pipe_read() which currently has to take care of this very unlikely case. Also, with this patch we can probably kill eat_empty_buffer() and more "is this buffer empty" checks in fs/splice.c later. Link: https://lore.kernel.org/all/20250209150718.GA17013@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250210114039.GA3588@redhat.com Tested-by: K Prateek Nayak Signed-off-by: Christian Brauner --- fs/pipe.c | 45 +++++++++------------------------------------ 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 2ae75adfba64b..b0641f75b1bae 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -360,29 +360,9 @@ anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } mutex_unlock(&pipe->mutex); - /* * We only get here if we didn't actually read anything. * - * However, we could have seen (and removed) a zero-sized - * pipe buffer, and might have made space in the buffers - * that way. - * - * You can't make zero-sized pipe buffers by doing an empty - * write (not even in packet mode), but they can happen if - * the writer gets an EFAULT when trying to fill a buffer - * that already got allocated and inserted in the buffer - * array. - * - * So we still need to wake up any pending writers in the - * _very_ unlikely case that the pipe was full, but we got - * no data. - */ - if (unlikely(wake_writer)) - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - - /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need @@ -391,7 +371,6 @@ anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - wake_writer = false; wake_next_reader = true; mutex_lock(&pipe->mutex); } @@ -526,33 +505,27 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) pipe->tmp_page = page; } - /* Allocate a slot in the ring in advance and attach an - * empty buffer. If we fault or otherwise fail to use - * it, either the reader will consume it or it'll still - * be there for the next write. - */ - pipe->head = head + 1; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + if (!ret) + ret = -EFAULT; + break; + } + pipe->head = head + 1; + pipe->tmp_page = NULL; /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; - pipe->tmp_page = NULL; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; - } - ret += copied; buf->len = copied; + ret += copied; if (!iov_iter_count(from)) break; -- GitLab From 16681bea9a80080765c98b545ad74c17de2d513c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 11 Feb 2025 12:03:52 +0100 Subject: [PATCH 226/934] selftests/nolibc: split up architecture list in run-tests.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The list is getting overly long and any modifications introduce a lot of noise and are prone to conflicts. Split the string into a bash array and break that into multiple lines. Link: https://lore.kernel.org/r/20250211-nolibc-test-archs-v1-1-8e55aa3369cf@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index bc4e92b4f1b98..6db0111527688 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -17,7 +17,16 @@ perform_download=0 test_mode=system werror=1 llvm= -archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390 loongarch" +all_archs=( + i386 x86_64 + arm64 arm + mips32le mips32be + ppc ppc64 ppc64le + riscv32 riscv64 + s390 + loongarch +) +archs="${all_archs[@]}" TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") -- GitLab From 2be6ce9d9bd040780dda8d456fe8696a48d805be Mon Sep 17 00:00:00 2001 From: Gabriela Bittencourt Date: Mon, 2 Dec 2024 15:55:43 +0800 Subject: [PATCH 227/934] unicode: kunit: change tests filename and path Change utf8 kunit test filename and path to follow the style convention on Documentation/dev-tools/kunit/style.rst Co-developed-by: Pedro Orlando Signed-off-by: Pedro Orlando Co-developed-by: Danilo Pereira Signed-off-by: Danilo Pereira Signed-off-by: Gabriela Bittencourt Reviewed-by: David Gow Acked-by: Gabriel Krisman Bertazi Reviewed-by: Shuah Khan Reviewed-by: Rae Moar Link: https://lore.kernel.org/r/20241202075545.3648096-7-davidgow@google.com Signed-off-by: Kees Cook --- fs/unicode/Makefile | 2 +- fs/unicode/{ => tests}/.kunitconfig | 0 fs/unicode/{utf8-selftest.c => tests/utf8_kunit.c} | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename fs/unicode/{ => tests}/.kunitconfig (100%) rename fs/unicode/{utf8-selftest.c => tests/utf8_kunit.c} (99%) diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 37bbcbc628a13..d95be7fb9f6b7 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),) obj-y += unicode.o endif obj-$(CONFIG_UNICODE) += utf8data.o -obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o unicode-y := utf8-norm.o utf8-core.o diff --git a/fs/unicode/.kunitconfig b/fs/unicode/tests/.kunitconfig similarity index 100% rename from fs/unicode/.kunitconfig rename to fs/unicode/tests/.kunitconfig diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c similarity index 99% rename from fs/unicode/utf8-selftest.c rename to fs/unicode/tests/utf8_kunit.c index 9476ab012baa1..5063e8138aeca 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/tests/utf8_kunit.c @@ -8,7 +8,7 @@ #include #include -#include "utf8n.h" +#include "../utf8n.h" static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ -- GitLab From 9ab61886ac683e8ff1b261a1738d5e68cd645604 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Mon, 3 Feb 2025 15:54:00 +0800 Subject: [PATCH 228/934] lib/math: Add Kunit test suite for gcd() Add a KUnit test suite for the gcd() function. This test suite verifies the correctness of gcd() across various scenarios, including edge cases. Signed-off-by: Yu-Chun Lin Reviewed-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20250203075400.3431330-1-eleanor15x@gmail.com Signed-off-by: Kees Cook --- lib/Kconfig.debug | 13 +++++++++ lib/math/tests/Makefile | 1 + lib/math/tests/gcd_kunit.c | 56 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 lib/math/tests/gcd_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 86ddf568cbca5..ad17254658702 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3222,6 +3222,19 @@ config INT_LOG_KUNIT_TEST If unsure, say N +config GCD_KUNIT_TEST + tristate "Greatest common divisor test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the gcd() function, + which computes the greatest common divisor of two numbers. + + This test suite verifies the correctness of gcd() across various + scenarios, including edge cases. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile index 14a245778394d..1d2dd6424df89 100644 --- a/lib/math/tests/Makefile +++ b/lib/math/tests/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_GCD_KUNIT_TEST) += gcd_kunit.o obj-$(CONFIG_INT_LOG_KUNIT_TEST) += int_log_kunit.o obj-$(CONFIG_INT_POW_KUNIT_TEST) += int_pow_kunit.o obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o diff --git a/lib/math/tests/gcd_kunit.c b/lib/math/tests/gcd_kunit.c new file mode 100644 index 0000000000000..ede1883583b1d --- /dev/null +++ b/lib/math/tests/gcd_kunit.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +struct test_case_params { + unsigned long val1; + unsigned long val2; + unsigned long expected_result; + const char *name; +}; + +static const struct test_case_params params[] = { + { 48, 18, 6, "GCD of 48 and 18" }, + { 18, 48, 6, "GCD of 18 and 48" }, + { 56, 98, 14, "GCD of 56 and 98" }, + { 17, 13, 1, "Coprime numbers" }, + { 101, 103, 1, "Coprime numbers" }, + { 270, 192, 6, "GCD of 270 and 192" }, + { 0, 5, 5, "GCD with zero" }, + { 7, 0, 7, "GCD with zero reversed" }, + { 36, 36, 36, "GCD of identical numbers" }, + { ULONG_MAX, 1, 1, "GCD of max ulong and 1" }, + { ULONG_MAX, ULONG_MAX, ULONG_MAX, "GCD of max ulong values" }, +}; + +static void get_desc(const struct test_case_params *tc, char *desc) +{ + strscpy(desc, tc->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(gcd, params, get_desc); + +static void gcd_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, gcd(tc->val1, tc->val2)); +} + +static struct kunit_case math_gcd_test_cases[] = { + KUNIT_CASE_PARAM(gcd_test, gcd_gen_params), + {} +}; + +static struct kunit_suite gcd_test_suite = { + .name = "math-gcd", + .test_cases = math_gcd_test_cases, +}; + +kunit_test_suite(gcd_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("math.gcd KUnit test suite"); +MODULE_AUTHOR("Yu-Chun Lin "); -- GitLab From 313b38a6ecb46db4002925af91b64df4f2b76d1f Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Sat, 8 Feb 2025 21:44:39 -0500 Subject: [PATCH 229/934] lib/prime_numbers: convert self-test to KUnit Extract a private header and convert the prime_numbers self-test to a KUnit test. I considered parameterizing the test using `KUNIT_CASE_PARAM` but didn't see how it was possible since the test logic is entangled with the test parameter generation logic. Signed-off-by: Tamir Duberstein Link: https://lore.kernel.org/r/20250208-prime_numbers-kunit-convert-v5-2-b0cb82ae7c7d@gmail.com Signed-off-by: Kees Cook --- lib/Kconfig.debug | 14 +++ lib/math/prime_numbers.c | 91 ++++---------------- lib/math/prime_numbers_private.h | 16 ++++ lib/math/tests/Makefile | 1 + lib/math/tests/prime_numbers_kunit.c | 59 +++++++++++++ tools/testing/selftests/lib/config | 1 - tools/testing/selftests/lib/prime_numbers.sh | 4 - 7 files changed, 109 insertions(+), 77 deletions(-) create mode 100644 lib/math/prime_numbers_private.h create mode 100644 lib/math/tests/prime_numbers_kunit.c delete mode 100755 tools/testing/selftests/lib/prime_numbers.sh diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ad17254658702..7e548b9e36b71 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3235,6 +3235,20 @@ config GCD_KUNIT_TEST If unsure, say N +config PRIME_NUMBERS_KUNIT_TEST + tristate "Prime number generator test" if !KUNIT_ALL_TESTS + depends on KUNIT + select PRIME_NUMBERS + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the {is,next}_prime_number + functions. + + Enabling this option will include tests that compare the prime number + generator functions against a brute force implementation. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c index 9a17ee9af93a1..95a6f7960db9e 100644 --- a/lib/math/prime_numbers.c +++ b/lib/math/prime_numbers.c @@ -1,16 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "prime numbers: " fmt #include #include #include #include -struct primes { - struct rcu_head rcu; - unsigned long last, sz; - unsigned long primes[]; -}; +#include "prime_numbers_private.h" #if BITS_PER_LONG == 64 static const struct primes small_primes = { @@ -62,9 +57,25 @@ static const struct primes small_primes = { static DEFINE_MUTEX(lock); static const struct primes __rcu *primes = RCU_INITIALIZER(&small_primes); -static unsigned long selftest_max; +#if IS_ENABLED(CONFIG_PRIME_NUMBERS_KUNIT_TEST) +/* + * Calls the callback under RCU lock. The callback must not retain + * the primes pointer. + */ +void with_primes(void *ctx, primes_fn fn) +{ + rcu_read_lock(); + fn(ctx, rcu_dereference(primes)); + rcu_read_unlock(); +} +EXPORT_SYMBOL(with_primes); + +EXPORT_SYMBOL(slow_is_prime_number); -static bool slow_is_prime_number(unsigned long x) +#else +static +#endif +bool slow_is_prime_number(unsigned long x) { unsigned long y = int_sqrt(x); @@ -239,77 +250,13 @@ bool is_prime_number(unsigned long x) } EXPORT_SYMBOL(is_prime_number); -static void dump_primes(void) -{ - const struct primes *p; - char *buf; - - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - - rcu_read_lock(); - p = rcu_dereference(primes); - - if (buf) - bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); - pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s\n", - p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); - - rcu_read_unlock(); - - kfree(buf); -} - -static int selftest(unsigned long max) -{ - unsigned long x, last; - - if (!max) - return 0; - - for (last = 0, x = 2; x < max; x++) { - bool slow = slow_is_prime_number(x); - bool fast = is_prime_number(x); - - if (slow != fast) { - pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!\n", - x, slow ? "yes" : "no", fast ? "yes" : "no"); - goto err; - } - - if (!slow) - continue; - - if (next_prime_number(last) != x) { - pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu\n", - last, x, next_prime_number(last)); - goto err; - } - last = x; - } - - pr_info("%s(%lu) passed, last prime was %lu\n", __func__, x, last); - return 0; - -err: - dump_primes(); - return -EINVAL; -} - -static int __init primes_init(void) -{ - return selftest(selftest_max); -} - static void __exit primes_exit(void) { free_primes(); } -module_init(primes_init); module_exit(primes_exit); -module_param_named(selftest, selftest_max, ulong, 0400); - MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Prime number library"); MODULE_LICENSE("GPL"); diff --git a/lib/math/prime_numbers_private.h b/lib/math/prime_numbers_private.h new file mode 100644 index 0000000000000..f3ebf5386e6b4 --- /dev/null +++ b/lib/math/prime_numbers_private.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +struct primes { + struct rcu_head rcu; + unsigned long last, sz; + unsigned long primes[]; +}; + +#if IS_ENABLED(CONFIG_PRIME_NUMBERS_KUNIT_TEST) +typedef void (*primes_fn)(void *, const struct primes *); + +void with_primes(void *ctx, primes_fn fn); +bool slow_is_prime_number(unsigned long x); +#endif diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile index 1d2dd6424df89..13dc96e48408b 100644 --- a/lib/math/tests/Makefile +++ b/lib/math/tests/Makefile @@ -4,4 +4,5 @@ obj-$(CONFIG_GCD_KUNIT_TEST) += gcd_kunit.o obj-$(CONFIG_INT_LOG_KUNIT_TEST) += int_log_kunit.o obj-$(CONFIG_INT_POW_KUNIT_TEST) += int_pow_kunit.o obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o +obj-$(CONFIG_PRIME_NUMBERS_KUNIT_TEST) += prime_numbers_kunit.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational_kunit.o diff --git a/lib/math/tests/prime_numbers_kunit.c b/lib/math/tests/prime_numbers_kunit.c new file mode 100644 index 0000000000000..2f1643208c661 --- /dev/null +++ b/lib/math/tests/prime_numbers_kunit.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include "../prime_numbers_private.h" + +static void dump_primes(void *ctx, const struct primes *p) +{ + static char buf[PAGE_SIZE]; + struct kunit_suite *suite = ctx; + + bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); + kunit_info(suite, "primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s", + p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); +} + +static void prime_numbers_test(struct kunit *test) +{ + const unsigned long max = 65536; + unsigned long x, last, next; + + for (last = 0, x = 2; x < max; x++) { + const bool slow = slow_is_prime_number(x); + const bool fast = is_prime_number(x); + + KUNIT_ASSERT_EQ_MSG(test, slow, fast, "is-prime(%lu)", x); + + if (!slow) + continue; + + next = next_prime_number(last); + KUNIT_ASSERT_EQ_MSG(test, next, x, "next-prime(%lu)", last); + last = next; + } +} + +static void kunit_suite_exit(struct kunit_suite *suite) +{ + with_primes(suite, dump_primes); +} + +static struct kunit_case prime_numbers_cases[] = { + KUNIT_CASE(prime_numbers_test), + {}, +}; + +static struct kunit_suite prime_numbers_suite = { + .name = "math-prime_numbers", + .suite_exit = kunit_suite_exit, + .test_cases = prime_numbers_cases, +}; + +kunit_test_suite(prime_numbers_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Prime number library"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index dc15aba8d0a3d..306a3d4dca987 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -1,5 +1,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m -CONFIG_PRIME_NUMBERS=m CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lib/prime_numbers.sh b/tools/testing/selftests/lib/prime_numbers.sh deleted file mode 100755 index 370b79a9cb2e9..0000000000000 --- a/tools/testing/selftests/lib/prime_numbers.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Checks fast/slow prime_number generation for inconsistencies -$(dirname $0)/../kselftest/module.sh "prime numbers" prime_numbers selftest=65536 -- GitLab From b09d96e0847b6882891a49a674ddc5e4f6c6b9dd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Feb 2025 22:08:01 +0100 Subject: [PATCH 230/934] docs: ABI: drop two duplicate symbols As warned by get_abi.py, there are two symbols that are defined twice: WARNING: /sys/devices/system/cpu/cpuX/topology/physical_package_id is defined 2 times: \ /new_devel/v4l/docs/Documentation/ABI/stable/sysfs-devices-system-cpu:27; \ /new_devel/v4l/docs/Documentation/ABI/testing/sysfs-devices-system-cpu:70 WARNING: /sys/devices/system/cpu/cpuX/topology/ppin is defined 2 times: \ /new_devel/v4l/docs/Documentation/ABI/stable/sysfs-devices-system-cpu:89; \ /new_devel/v4l/docs/Documentation/ABI/testing/sysfs-devices-system-cpu:70 As the documentation at testing/sysfs-devices-system-cpu, drop the duplicated one from stable. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/c3dce809f577584cf9aedafc6c2a0d5a9ca909ac.1739394480.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/ABI/stable/sysfs-devices-system-cpu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-devices-system-cpu b/Documentation/ABI/stable/sysfs-devices-system-cpu index 902392d7eddf0..cf78bd99f6c86 100644 --- a/Documentation/ABI/stable/sysfs-devices-system-cpu +++ b/Documentation/ABI/stable/sysfs-devices-system-cpu @@ -24,12 +24,6 @@ Description: Default value for the Data Stream Control Register (DSCR) on If set by a process it will be inherited by child processes. Values: 64 bit unsigned integer (bit field) -What: /sys/devices/system/cpu/cpuX/topology/physical_package_id -Description: physical package id of cpuX. Typically corresponds to a physical - socket number, but the actual value is architecture and platform - dependent. -Values: integer - What: /sys/devices/system/cpu/cpuX/topology/die_id Description: the CPU die ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is @@ -86,10 +80,6 @@ What: /sys/devices/system/cpu/cpuX/topology/die_cpus Description: internal kernel map of CPUs within the same die. Values: hexadecimal bitmask. -What: /sys/devices/system/cpu/cpuX/topology/ppin -Description: per-socket protected processor inventory number -Values: hexadecimal. - What: /sys/devices/system/cpu/cpuX/topology/die_cpus_list Description: human-readable list of CPUs within the same die. The format is like 0-3, 8-11, 14,17. -- GitLab From ba561b485709b6160e56d1fe32a2717fffb332cc Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Feb 2025 07:02:52 +0100 Subject: [PATCH 231/934] scripts/kernel-doc: remove an obscure logic from kernel-doc Kernel-doc has an obscure logic that uses an external file to map files via a .tmp_filelist.txt file stored at the current directory. The rationale for such code predates git time, as it was added on Kernel v2.4.5.5, with the following description: # 26/05/2001 - Support for separate source and object trees. # Return error code. # Keith Owens from commit 396a6123577d ("v2.4.5.4 -> v2.4.5.5") at the historic tree: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/ Support for separate source and object trees is now done on a different way via make O=. There's no logic to create such file, so it sounds to me that this is just dead code. So, drop it. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/fd3b28dec36ba1668325d6770d4c4754414337fc.1739340170.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/kernel-doc | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index e57c5e989a0a8..70da9a3369c6d 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -179,7 +179,7 @@ my ($function, %function_table, %parametertypes, $declaration_purpose); my %nosymbol_table = (); my $declaration_start_line; my ($type, $declaration_name, $return_type); -my ($newsection, $newcontents, $prototype, $brcount, %source_map); +my ($newsection, $newcontents, $prototype, $brcount); if (defined($ENV{'KBUILD_VERBOSE'}) && $ENV{'KBUILD_VERBOSE'} =~ '1') { $verbose = 1; @@ -2005,10 +2005,6 @@ sub map_filename($) { $file = $orig_file; } - if (defined($source_map{$file})) { - $file = $source_map{$file}; - } - return $file; } @@ -2403,19 +2399,6 @@ for (my $k = 0; $k < @highlights; $k++) { $dohighlight .= "\$contents =~ s:$pattern:$result:gs;\n"; } -# Read the file that maps relative names to absolute names for -# separate source and object directories and for shadow trees. -if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { - my ($relname, $absname); - while() { - chop(); - ($relname, $absname) = (split())[0..1]; - $relname =~ s:^/+::; - $source_map{$relname} = $absname; - } - close(SOURCE_MAP); -} - if ($output_selection == OUTPUT_EXPORTED || $output_selection == OUTPUT_INTERNAL) { -- GitLab From b126dbf52e981acbc25a1cb7b2db73195263a1fd Mon Sep 17 00:00:00 2001 From: Aditya Dutt Date: Tue, 11 Feb 2025 16:00:02 +0530 Subject: [PATCH 232/934] Documentation/driver-api: fixed spelling mistakes Fixed some spelling mistakes identified by misspell tool. The example code in Documentation/driver-api/nvdimm/nvdimm.rst contained a misspelled identifier (paramaters instead of parameters). This typo would have caused a compilation error if copied as-is. Signed-off-by: Aditya Dutt Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250211103002.199004-1-duttaditya18@gmail.com --- Documentation/driver-api/media/drivers/zoran.rst | 2 +- Documentation/driver-api/media/maintainer-entry-profile.rst | 2 +- Documentation/driver-api/nvdimm/nvdimm.rst | 6 +++--- Documentation/driver-api/pm/devices.rst | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/driver-api/media/drivers/zoran.rst b/Documentation/driver-api/media/drivers/zoran.rst index b205e10c31546..3e05b7f0442a1 100644 --- a/Documentation/driver-api/media/drivers/zoran.rst +++ b/Documentation/driver-api/media/drivers/zoran.rst @@ -222,7 +222,7 @@ The CCIR - I uses the PAL colorsystem, and is used in Great Britain, Hong Kong, Ireland, Nigeria, South Africa. The CCIR - N uses the PAL colorsystem and PAL frame size but the NTSC framerate, -and is used in Argentinia, Uruguay, an a few others +and is used in Argentina, Uruguay, an a few others We do not talk about how the audio is broadcast ! diff --git a/Documentation/driver-api/media/maintainer-entry-profile.rst b/Documentation/driver-api/media/maintainer-entry-profile.rst index ffc712a5f6328..ad96a89ee9160 100644 --- a/Documentation/driver-api/media/maintainer-entry-profile.rst +++ b/Documentation/driver-api/media/maintainer-entry-profile.rst @@ -116,7 +116,7 @@ CEC drivers ``cec-compliance`` .. [3] The ``v4l2-compliance`` also covers the media controller usage inside V4L2 drivers. -Other compilance tools are under development to check other parts of the +Other compliance tools are under development to check other parts of the subsystem. Those tests need to pass before the patches go upstream. diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index ca16b5acbf30d..c205efa4d45b0 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -535,12 +535,12 @@ internally with a static identifier:: char devname[50]; snprintf(devname, sizeof(devname), "namespace%d.%d", - ndctl_region_get_id(region), paramaters->id); + ndctl_region_get_id(region), parameters->id); ndctl_namespace_set_alt_name(ndns, devname); /* 'uuid' must be set prior to setting size! */ - ndctl_namespace_set_uuid(ndns, paramaters->uuid); - ndctl_namespace_set_size(ndns, paramaters->size); + ndctl_namespace_set_uuid(ndns, parameters->uuid); + ndctl_namespace_set_size(ndns, parameters->size); /* unlike pmem namespaces, blk namespaces have a sector size */ if (parameters->lbasize) ndctl_namespace_set_sector_size(ndns, parameters->lbasize); diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index d448cb57df86c..8d86d5da4023a 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -358,7 +358,7 @@ the phases are: ``prepare``, ``suspend``, ``suspend_late``, ``suspend_noirq``. is probed against the device in question by passing them to the :c:func:`dev_pm_set_driver_flags` helper function.] If the first of these flags is set, the PM core will not apply the direct-complete - procedure described above to the given device and, consequenty, to any + procedure described above to the given device and, consequently, to any of its ancestors. The second flag, when set, informs the middle layer code (bus types, device types, PM domains, classes) that it should take the return value of the ``->prepare`` callback provided by the driver -- GitLab From df60e5290599116f7636696a46d738a489e2dc83 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 06:44:06 +0100 Subject: [PATCH 233/934] docs: Makefile: use the new script to check for bad ABI references The get_abi.pl script was replaced by get_abi.py. Update it at docs makefile. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502110736.ZGWaWsep-lkp@intel.com/ Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/20250211054446.1696826-1-mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Makefile b/Documentation/Makefile index 52c6c5a3efa99..63094646df289 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,7 +12,7 @@ endif # Check for broken ABI files ifeq ($(CONFIG_WARN_ABI_ERRORS),y) -$(shell $(srctree)/scripts/get_abi.pl validate --dir $(srctree)/Documentation/ABI) +$(shell $(srctree)/scripts/get_abi.py --dir $(srctree)/Documentation/ABI validate) endif # You can set these variables from the command line. -- GitLab From 6a0c4b61e13f9ab1d6395823fb06b0763eb370bd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 08:00:46 +0100 Subject: [PATCH 234/934] docs: trace: decode_msr.py: make it compatible with python 3 This script uses print instead of print(foo), which is incompatible with Python 3. Fix it. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/88bb0d47100feaa3cda215e68bf6500dc67da7b3.1739257245.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/trace/postprocess/decode_msr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/postprocess/decode_msr.py b/Documentation/trace/postprocess/decode_msr.py index aa9cc7abd5c2b..f5609b16f5898 100644 --- a/Documentation/trace/postprocess/decode_msr.py +++ b/Documentation/trace/postprocess/decode_msr.py @@ -32,6 +32,6 @@ for j in sys.stdin: break if r: j = j.replace(" " + m.group(2), " " + r + "(" + m.group(2) + ")") - print j, + print(j) -- GitLab From de61d6515baece4610e401d9d7c18cac6bd77198 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 06:57:57 +0100 Subject: [PATCH 235/934] docs: ABI: move README contents to the top The ABI documentation looks a little bit better if it starts with the contents of the README is placed at the beginning. Move it to the beginning of the ABI chapter. While here, improve the README text and change the title that will be shown at the html/pdf output to be coherent with both ABI file contents and with the generated documentation output. Suggested-by: Jonathan Corbet Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/20250211055809.1898623-1-mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/ABI/README | 3 ++- Documentation/admin-guide/abi-readme-file.rst | 6 ------ Documentation/admin-guide/abi.rst | 3 ++- scripts/lib/abi/abi_parser.py | 2 +- 4 files changed, 5 insertions(+), 9 deletions(-) delete mode 100644 Documentation/admin-guide/abi-readme-file.rst diff --git a/Documentation/ABI/README b/Documentation/ABI/README index 8bac9cb09a6de..ef0e6d11e919c 100644 --- a/Documentation/ABI/README +++ b/Documentation/ABI/README @@ -1,4 +1,5 @@ -This directory attempts to document the ABI between the Linux kernel and +This part of the documentation inside Documentation/ABI directory +attempts to document the ABI between the Linux kernel and userspace, and the relative stability of these interfaces. Due to the everchanging nature of Linux, and the differing maturity levels, these interfaces should be used by userspace programs in different ways. diff --git a/Documentation/admin-guide/abi-readme-file.rst b/Documentation/admin-guide/abi-readme-file.rst deleted file mode 100644 index 6172e4ccbda2a..0000000000000 --- a/Documentation/admin-guide/abi-readme-file.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -ABI README -========== - -.. kernel-abi:: README diff --git a/Documentation/admin-guide/abi.rst b/Documentation/admin-guide/abi.rst index 15a2dcb1388ca..c6039359e5853 100644 --- a/Documentation/admin-guide/abi.rst +++ b/Documentation/admin-guide/abi.rst @@ -4,6 +4,8 @@ Linux ABI description ===================== +.. kernel-abi:: README + ABI symbols ----------- @@ -21,7 +23,6 @@ ABI files .. toctree:: :maxdepth: 2 - abi-readme-file abi-stable-files abi-testing-files abi-obsolete-files diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py index f08de6d3bf7c1..66a738013ce13 100644 --- a/scripts/lib/abi/abi_parser.py +++ b/scripts/lib/abi/abi_parser.py @@ -266,7 +266,7 @@ class AbiParser: def parse_readme(self, nametag, fname): """Parse ABI README file""" - nametag["what"] = ["ABI file contents"] + nametag["what"] = ["Introduction"] nametag["path"] = "README" with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: for line in fp: -- GitLab From 3539c6411a7c9d6c5895f78750f93160705cd250 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2025 13:08:31 -1000 Subject: [PATCH 236/934] sched_ext: Implement SCX_OPS_ALLOW_QUEUED_WAKEUP A task wakeup can be either processed on the waker's CPU or bounced to the wakee's previous CPU using an IPI (ttwu_queue). Bouncing to the wakee's CPU avoids the waker's CPU locking and accessing the wakee's rq which can be expensive across cache and node boundaries. When ttwu_queue path is taken, select_task_rq() and thus ops.select_cpu() may be skipped in some cases (racing against the wakee switching out). As this confused some BPF schedulers, there wasn't a good way for a BPF scheduler to tell whether idle CPU selection has been skipped, ops.enqueue() couldn't insert tasks into foreign local DSQs, and the performance difference on machines with simple toplogies were minimal, sched_ext disabled ttwu_queue. However, this optimization makes noticeable difference on more complex topologies and a BPF scheduler now has an easy way tell whether ops.select_cpu() was skipped since 9b671793c7d9 ("sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED") and can insert tasks into foreign local DSQs since 5b26f7b920f7 ("sched_ext: Allow SCX_DSQ_LOCAL_ON for direct dispatches"). Implement SCX_OPS_ALLOW_QUEUED_WAKEUP which allows BPF schedulers to choose to enable ttwu_queue optimization. v2: Update the patch description and comment re. ops.select_cpu() being skipped in some cases as opposed to always as per Neel. Signed-off-by: Tejun Heo Reported-by: Neel Natu Reported-by: Barret Rhoden Cc: Peter Zijlstra (Intel) Acked-by: Andrea Righi --- kernel/sched/core.c | 9 ++------- kernel/sched/ext.c | 32 ++++++++++++++++++++++++++------ kernel/sched/ext.h | 10 ++++++++++ 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e77897a62442e..618bb0a5eb1c2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3921,13 +3921,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 98d5f2f68f38c..2e1a1e4fc3044 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -96,7 +96,7 @@ enum scx_ops_flags { /* * Keep built-in idle tracking even if ops.update_idle() is implemented. */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, /* * By default, if there are no other task to run on the CPU, ext core @@ -104,7 +104,7 @@ enum scx_ops_flags { * flag is specified, such tasks are passed to ops.enqueue() with * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. */ - SCX_OPS_ENQ_LAST = 1LLU << 1, + SCX_OPS_ENQ_LAST = 1LLU << 1, /* * An exiting task may schedule after PF_EXITING is set. In such cases, @@ -117,13 +117,13 @@ enum scx_ops_flags { * depend on pid lookups and wants to handle these tasks directly, the * following flag can be used. */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, + SCX_OPS_ENQ_EXITING = 1LLU << 2, /* * If set, only tasks with policy set to SCHED_EXT are attached to * sched_ext. If clear, SCHED_NORMAL tasks are also included. */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, /* * A migration disabled task can only execute on its current CPU. By @@ -136,7 +136,23 @@ enum scx_ops_flags { * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr * and thus may disagree with cpumask_weight(p->cpus_ptr). */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, /* * CPU cgroup support flags @@ -147,6 +163,7 @@ enum scx_ops_flags { SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -897,6 +914,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static struct sched_ext_ops scx_ops; static bool scx_warned_zero_slice; +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); @@ -4717,6 +4735,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_allow_queued_wakeup); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); static_branch_disable(&scx_ops_enq_migration_disabled); @@ -5348,9 +5367,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable(&scx_has_op[i]); + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + static_branch_enable(&scx_ops_allow_queued_wakeup); if (ops->flags & SCX_OPS_ENQ_LAST) static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1079b56b0f7ae..1bda96b19a1bf 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,8 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + static_branch_likely(&scx_ops_allow_queued_wakeup) || + p->sched_class != &ext_sched_class; +} + #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ -- GitLab From d2b239099cf019c0b7ebcb9ed6f398bf8be4f626 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 07:19:01 +0100 Subject: [PATCH 237/934] docs: changes: update Sphinx minimal version to 3.4.3 Doing that allows us to get rid of all backward-compatible code. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Kees Cook Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/d79e357468c20d86913e9e343d785398f728aabb.1739254187.git.mchehab+huawei@kernel.org --- Documentation/conf.py | 2 +- Documentation/process/changes.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/conf.py b/Documentation/conf.py index 0c2205d536b38..3dad1f90b098b 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -47,7 +47,7 @@ from load_config import loadConfig # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -needs_sphinx = '2.4.4' +needs_sphinx = '3.4.3' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index a0beca805362d..66015c768cfca 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -58,7 +58,7 @@ mcelog 0.6 mcelog --version iptables 1.4.2 iptables -V openssl & libcrypto 1.0.0 openssl version bc 1.06.95 bc --version -Sphinx\ [#f1]_ 2.4.4 sphinx-build --version +Sphinx\ [#f1]_ 3.4.3 sphinx-build --version GNU tar 1.28 tar --version gtags (optional) 6.6.5 gtags --version mkimage (optional) 2017.01 mkimage --version -- GitLab From 5e25b972a22be2249af76b30831ccc62b88ad725 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 07:19:02 +0100 Subject: [PATCH 238/934] docs: changes: update Python minimal version The current minimal version doesn't match what we have currently at the Kernel: $ vermin -v $(git ls-files *.py) ... Minimum required versions: 3.10 Incompatible versions: 2 Those are the Python scripts requiring versions higher than current minimal (3.5): !2, 3.10 tools/net/sunrpc/xdrgen/generators/__init__.py !2, 3.10 tools/net/sunrpc/xdrgen/generators/program.py !2, 3.10 tools/net/sunrpc/xdrgen/subcmds/source.py !2, 3.10 tools/net/sunrpc/xdrgen/xdr_ast.py !2, 3.10 tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py !2, 3.9 tools/testing/selftests/net/rds/test.py !2, 3.9 tools/net/ynl/ethtool.py !2, 3.9 tools/net/ynl/cli.py !2, 3.9 scripts/checktransupdate.py !2, 3.8 tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py !2, 3.8 tools/testing/selftests/hid/tests/base.py !2, 3.7 tools/testing/selftests/turbostat/smi_aperf_mperf.py !2, 3.7 tools/testing/selftests/turbostat/defcolumns.py !2, 3.7 tools/testing/selftests/turbostat/added_perf_counters.py !2, 3.7 tools/testing/selftests/hid/tests/conftest.py !2, 3.7 tools/testing/kunit/qemu_config.py !2, 3.7 tools/testing/kunit/kunit_tool_test.py !2, 3.7 tools/testing/kunit/kunit.py !2, 3.7 tools/testing/kunit/kunit_parser.py !2, 3.7 tools/testing/kunit/kunit_kernel.py !2, 3.7 tools/testing/kunit/kunit_json.py !2, 3.7 tools/testing/kunit/kunit_config.py !2, 3.7 tools/perf/scripts/python/gecko.py !2, 3.7 scripts/rust_is_available_test.py !2, 3.7 scripts/bpf_doc.py !2, 3.6 tools/writeback/wb_monitor.py !2, 3.6 tools/workqueue/wq_monitor.py !2, 3.6 tools/workqueue/wq_dump.py !2, 3.6 tools/usb/p9_fwd.py !2, 3.6 tools/tracing/rtla/sample/timerlat_load.py !2, 3.6 tools/testing/selftests/net/openvswitch/ovs-dpctl.py !2, 3.6 tools/testing/selftests/net/nl_netdev.py !2, 3.6 tools/testing/selftests/net/lib/py/ynl.py !2, 3.6 tools/testing/selftests/net/lib/py/utils.py !2, 3.6 tools/testing/selftests/net/lib/py/nsim.py !2, 3.6 tools/testing/selftests/net/lib/py/netns.py !2, 3.6 tools/testing/selftests/net/lib/py/ksft.py !2, 3.6 tools/testing/selftests/kselftest/ksft.py !2, 3.6 tools/testing/selftests/hid/tests/test_tablet.py !2, 3.6 tools/testing/selftests/hid/tests/test_sony.py !2, 3.6 tools/testing/selftests/hid/tests/test_multitouch.py !2, 3.6 tools/testing/selftests/hid/tests/test_mouse.py !2, 3.6 tools/testing/selftests/hid/tests/base_gamepad.py !2, 3.6 tools/testing/selftests/hid/tests/base_device.py !2, 3.6 tools/testing/selftests/drivers/net/stats.py !2, 3.6 tools/testing/selftests/drivers/net/shaper.py !2, 3.6 tools/testing/selftests/drivers/net/queues.py !2, 3.6 tools/testing/selftests/drivers/net/ping.py !2, 3.6 tools/testing/selftests/drivers/net/lib/py/remote_ssh.py !2, 3.6 tools/testing/selftests/drivers/net/lib/py/load.py !2, 3.6 tools/testing/selftests/drivers/net/lib/py/__init__.py !2, 3.6 tools/testing/selftests/drivers/net/lib/py/env.py !2, 3.6 tools/testing/selftests/drivers/net/hw/rss_ctx.py !2, 3.6 tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py !2, 3.6 tools/testing/selftests/drivers/net/hw/nic_performance.py !2, 3.6 tools/testing/selftests/drivers/net/hw/nic_link_layer.py !2, 3.6 tools/testing/selftests/drivers/net/hw/lib/py/linkconfig.py !2, 3.6 tools/testing/selftests/drivers/net/hw/lib/py/__init__.py !2, 3.6 tools/testing/selftests/drivers/net/hw/devmem.py !2, 3.6 tools/testing/selftests/drivers/net/hw/devlink_port_split.py !2, 3.6 tools/testing/selftests/drivers/net/hw/csum.py !2, 3.6 tools/testing/selftests/devices/probe/test_discoverable_devices.py !2, 3.6 tools/testing/selftests/bpf/test_bpftool_synctypes.py !2, 3.6 tools/testing/selftests/bpf/generate_udp_fragments.py !2, 3.6 tools/testing/kunit/run_checks.py !2, 3.6 tools/testing/kunit/kunit_printer.py !2, 3.6 tools/sched_ext/scx_show_state.py !2, 3.6 tools/perf/tests/shell/lib/perf_metric_validation.py !2, 3.6 tools/perf/tests/shell/lib/perf_json_output_lint.py !2, 3.6 tools/perf/scripts/python/parallel-perf.py !2, 3.6 tools/perf/scripts/python/flamegraph.py !2, 3.6 tools/perf/scripts/python/arm-cs-trace-disasm.py !2, 3.6 tools/perf/pmu-events/models.py !2, 3.6 tools/perf/pmu-events/metric_test.py !2, 3.6 tools/perf/pmu-events/metric.py !2, 3.6 tools/perf/pmu-events/jevents.py !2, 3.6 tools/net/ynl/ynl-gen-rst.py !2, 3.6 tools/net/ynl/ynl-gen-c.py !2, 3.6 tools/net/ynl/lib/ynl.py !2, 3.6 tools/net/ynl/lib/nlspec.py !2, 3.6 tools/crypto/tcrypt/tcrypt_speed_compare.py !2, 3.6 tools/cgroup/iocost_monitor.py !2, 3.6 tools/cgroup/iocost_coef_gen.py !2, 3.6 scripts/make_fit.py !2, 3.6 scripts/macro_checker.py !2, 3.6 scripts/get_abi.py !2, 3.6 scripts/generate_rust_analyzer.py !2, 3.6 scripts/gdb/linux/timerlist.py !2, 3.6 scripts/gdb/linux/pgtable.py !2, 3.6 scripts/clang-tools/run-clang-tools.py !2, 3.6 Documentation/sphinx/automarkup.py Even if we exclude tools/net/sunrpc/xdrgen/, the minimal version is Python 3.9. Update process/changes to reflect the current minimal version required to run Python scripts outside tools. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Kees Cook Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/34dda7a5a75f30380d95d8e85a8813be98dc72fe.1739254187.git.mchehab+huawei@kernel.org --- Documentation/process/changes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 66015c768cfca..d564362773b53 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -62,7 +62,7 @@ Sphinx\ [#f1]_ 3.4.3 sphinx-build --version GNU tar 1.28 tar --version gtags (optional) 6.6.5 gtags --version mkimage (optional) 2017.01 mkimage --version -Python (optional) 3.5.x python3 --version +Python (optional) 3.9.x python3 --version GNU AWK (optional) 5.1.0 gawk --version ====================== =============== ======================================== -- GitLab From 8def404249af7a623eb22b8bcfb7430127c1edb3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 07:19:03 +0100 Subject: [PATCH 239/934] docs: extensions: don't use utf-8 syntax for descriptions None of the descriptions at the Sphinx extensions are using non-ascii characters, so no need to place them under u""" notation. Also, according with: https://docs.python.org/3/deprecations/pending-removal-in-3.16.html the 'u' format code is scheduled to be removed in Python 3.16. So, let's just use """. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Kees Cook Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/8a42f6be53464af4b866492a7e9ddf29c0429997.1739254187.git.mchehab+huawei@kernel.org --- Documentation/sphinx/cdomain.py | 4 ++-- Documentation/sphinx/kernel_abi.py | 6 +++--- Documentation/sphinx/kernel_feat.py | 4 ++-- Documentation/sphinx/kernel_include.py | 4 ++-- Documentation/sphinx/kfigure.py | 10 +++++----- Documentation/sphinx/load_config.py | 2 +- Documentation/sphinx/maintainers_include.py | 4 ++-- Documentation/sphinx/rstFlatTable.py | 10 +++++----- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py index e6959af25402d..6596fd00663f1 100644 --- a/Documentation/sphinx/cdomain.py +++ b/Documentation/sphinx/cdomain.py @@ -1,6 +1,6 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=W0141,C0113,C0103,C0325 -u""" +""" cdomain ~~~~~~~ @@ -145,7 +145,7 @@ class CObject(Base_CObject): } def handle_func_like_macro(self, sig, signode): - u"""Handles signatures of function-like macros. + """Handles signatures of function-like macros. If the objtype is 'function' and the signature ``sig`` is a function-like macro, the name of the macro is returned. Otherwise diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index e017b02999532..db6f0380de94c 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -2,7 +2,7 @@ # coding=utf-8 # SPDX-License-Identifier: GPL-2.0 # -u""" +""" kernel-abi ~~~~~~~~~~ @@ -55,7 +55,7 @@ path = os.path.join(srctree, "Documentation/ABI") _kernel_abi = None def get_kernel_abi(): - u""" + """ Initialize kernel_abi global var, if not initialized yet. This is needed to avoid warnings during Sphinx module initialization. @@ -81,7 +81,7 @@ def setup(app): class KernelCmd(Directive): - u"""KernelABI (``kernel-abi``) directive""" + """KernelABI (``kernel-abi``) directive""" required_arguments = 1 optional_arguments = 3 diff --git a/Documentation/sphinx/kernel_feat.py b/Documentation/sphinx/kernel_feat.py index 03ace5f01b5c0..e3a51867f27bd 100644 --- a/Documentation/sphinx/kernel_feat.py +++ b/Documentation/sphinx/kernel_feat.py @@ -1,7 +1,7 @@ # coding=utf-8 # SPDX-License-Identifier: GPL-2.0 # -u""" +""" kernel-feat ~~~~~~~~~~~ @@ -56,7 +56,7 @@ def setup(app): class KernelFeat(Directive): - u"""KernelFeat (``kernel-feat``) directive""" + """KernelFeat (``kernel-feat``) directive""" required_arguments = 1 optional_arguments = 2 diff --git a/Documentation/sphinx/kernel_include.py b/Documentation/sphinx/kernel_include.py index 6387624423363..8db176045bc56 100755 --- a/Documentation/sphinx/kernel_include.py +++ b/Documentation/sphinx/kernel_include.py @@ -2,7 +2,7 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=R0903, C0330, R0914, R0912, E0401 -u""" +""" kernel-include ~~~~~~~~~~~~~~ @@ -56,7 +56,7 @@ def setup(app): class KernelInclude(Include): # ============================================================================== - u"""KernelInclude (``kernel-include``) directive""" + """KernelInclude (``kernel-include``) directive""" def run(self): env = self.state.document.settings.env diff --git a/Documentation/sphinx/kfigure.py b/Documentation/sphinx/kfigure.py index 383f9a695b082..f1a7f13c9c605 100644 --- a/Documentation/sphinx/kfigure.py +++ b/Documentation/sphinx/kfigure.py @@ -1,6 +1,6 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=C0103, R0903, R0912, R0915 -u""" +""" scalable figure and image handling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -165,7 +165,7 @@ def setup(app): def setupTools(app): - u""" + """ Check available build tools and log some *verbose* messages. This function is called once, when the builder is initiated. @@ -445,7 +445,7 @@ class kernel_image(nodes.image): pass class KernelImage(images.Image): - u"""KernelImage directive + """KernelImage directive Earns everything from ``.. image::`` directive, except *remote URI* and *glob* pattern. The KernelImage wraps a image node into a @@ -481,7 +481,7 @@ class kernel_figure(nodes.figure): """Node for ``kernel-figure`` directive.""" class KernelFigure(Figure): - u"""KernelImage directive + """KernelImage directive Earns everything from ``.. figure::`` directive, except *remote URI* and *glob* pattern. The KernelFigure wraps a figure node into a kernel_figure @@ -557,7 +557,7 @@ class kernel_render(nodes.General, nodes.Inline, nodes.Element): pass class KernelRender(Figure): - u"""KernelRender directive + """KernelRender directive Render content by external tool. Has all the options known from the *figure* directive, plus option ``caption``. If ``caption`` has a diff --git a/Documentation/sphinx/load_config.py b/Documentation/sphinx/load_config.py index 8b416bfd75ac1..ec50e1ee5223f 100644 --- a/Documentation/sphinx/load_config.py +++ b/Documentation/sphinx/load_config.py @@ -9,7 +9,7 @@ from sphinx.util.osutil import fs_encoding def loadConfig(namespace): # ------------------------------------------------------------------------------ - u"""Load an additional configuration file into *namespace*. + """Load an additional configuration file into *namespace*. The name of the configuration file is taken from the environment ``SPHINX_CONF``. The external configuration file extends (or overwrites) the diff --git a/Documentation/sphinx/maintainers_include.py b/Documentation/sphinx/maintainers_include.py index dcad0fff4723e..d31cff8674367 100755 --- a/Documentation/sphinx/maintainers_include.py +++ b/Documentation/sphinx/maintainers_include.py @@ -3,7 +3,7 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=R0903, C0330, R0914, R0912, E0401 -u""" +""" maintainers-include ~~~~~~~~~~~~~~~~~~~ @@ -37,7 +37,7 @@ def setup(app): ) class MaintainersInclude(Include): - u"""MaintainersInclude (``maintainers-include``) directive""" + """MaintainersInclude (``maintainers-include``) directive""" required_arguments = 0 def parse_maintainers(self, path): diff --git a/Documentation/sphinx/rstFlatTable.py b/Documentation/sphinx/rstFlatTable.py index 16bea0632555f..180fbb50c3374 100755 --- a/Documentation/sphinx/rstFlatTable.py +++ b/Documentation/sphinx/rstFlatTable.py @@ -2,7 +2,7 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=C0330, R0903, R0912 -u""" +""" flat-table ~~~~~~~~~~ @@ -99,7 +99,7 @@ class colSpan(nodes.General, nodes.Element): pass # pylint: disable=C0103,C0321 class FlatTable(Table): # ============================================================================== - u"""FlatTable (``flat-table``) directive""" + """FlatTable (``flat-table``) directive""" option_spec = { 'name': directives.unchanged @@ -135,7 +135,7 @@ class FlatTable(Table): class ListTableBuilder(object): # ============================================================================== - u"""Builds a table from a double-stage list""" + """Builds a table from a double-stage list""" def __init__(self, directive): self.directive = directive @@ -212,7 +212,7 @@ class ListTableBuilder(object): raise SystemMessagePropagation(error) def parseFlatTableNode(self, node): - u"""parses the node from a :py:class:`FlatTable` directive's body""" + """parses the node from a :py:class:`FlatTable` directive's body""" if len(node) != 1 or not isinstance(node[0], nodes.bullet_list): self.raiseError( @@ -225,7 +225,7 @@ class ListTableBuilder(object): self.roundOffTableDefinition() def roundOffTableDefinition(self): - u"""Round off the table definition. + """Round off the table definition. This method rounds off the table definition in :py:member:`rows`. -- GitLab From 089e06c3f113a8641a6cf502a34284a7c0ca1630 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 11 Feb 2025 07:19:04 +0100 Subject: [PATCH 240/934] scripts/kernel-doc: drop Sphinx version check As the current minimal supported Sphinx version is 3.4.3, drop support for older versions. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Kees Cook Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/0d002e7550476a68547ee53ad06cfd8fdcaf7c3a.1739254187.git.mchehab+huawei@kernel.org --- Documentation/sphinx/cdomain.py | 3 - Documentation/sphinx/kerneldoc.py | 5 -- scripts/kernel-doc | 129 ++++-------------------------- 3 files changed, 16 insertions(+), 121 deletions(-) diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py index 6596fd00663f1..e8ea80d4324c1 100644 --- a/Documentation/sphinx/cdomain.py +++ b/Documentation/sphinx/cdomain.py @@ -45,9 +45,6 @@ import re __version__ = '1.1' -# Get Sphinx version -major, minor, patch = sphinx.version_info[:3] - # Namespace to be prepended to the full name namespace = None diff --git a/Documentation/sphinx/kerneldoc.py b/Documentation/sphinx/kerneldoc.py index be5b8fbf373ff..39ddae6ae7dd7 100644 --- a/Documentation/sphinx/kerneldoc.py +++ b/Documentation/sphinx/kerneldoc.py @@ -62,11 +62,6 @@ class KernelDocDirective(Directive): env = self.state.document.settings.env cmd = [env.config.kerneldoc_bin, '-rst', '-enable-lineno'] - # Pass the version string to kernel-doc, as it needs to use a different - # dialect, depending what the C domain supports for each specific - # Sphinx versions - cmd += ['-sphinx-version', sphinx.__version__] - filename = env.config.kerneldoc_srctree + '/' + self.arguments[0] export_file_patterns = [] diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 70da9a3369c6d..2c77b914d017e 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -26,7 +26,7 @@ kernel-doc - Print formatted kernel documentation to stdout kernel-doc [-h] [-v] [-Werror] [-Wall] [-Wreturn] [-Wshort-desc[ription]] [-Wcontents-before-sections] [ -man | - -rst [-sphinx-version VERSION] [-enable-lineno] | + -rst [-enable-lineno] | -none ] [ @@ -130,7 +130,6 @@ if ($#ARGV == -1) { } my $kernelversion; -my ($sphinx_major, $sphinx_minor, $sphinx_patch); my $dohighlight = ""; @@ -347,23 +346,6 @@ while ($ARGV[0] =~ m/^--?(.*)/) { $enable_lineno = 1; } elsif ($cmd eq 'show-not-found') { $show_not_found = 1; # A no-op but don't fail - } elsif ($cmd eq "sphinx-version") { - my $ver_string = shift @ARGV; - if ($ver_string =~ m/^(\d+)(\.\d+)?(\.\d+)?/) { - $sphinx_major = $1; - if (defined($2)) { - $sphinx_minor = substr($2,1); - } else { - $sphinx_minor = 0; - } - if (defined($3)) { - $sphinx_patch = substr($3,1) - } else { - $sphinx_patch = 0; - } - } else { - die "Sphinx version should either major.minor or major.minor.patch format\n"; - } } else { # Unknown argument pod2usage( @@ -387,8 +369,6 @@ while ($ARGV[0] =~ m/^--?(.*)/) { # continue execution near EOF; -# The C domain dialect changed on Sphinx 3. So, we need to check the -# version in order to produce the right tags. sub findprog($) { foreach(split(/:/, $ENV{PATH})) { @@ -396,42 +376,6 @@ sub findprog($) } } -sub get_sphinx_version() -{ - my $ver; - - my $cmd = "sphinx-build"; - if (!findprog($cmd)) { - my $cmd = "sphinx-build3"; - if (!findprog($cmd)) { - $sphinx_major = 1; - $sphinx_minor = 2; - $sphinx_patch = 0; - printf STDERR "Warning: Sphinx version not found. Using default (Sphinx version %d.%d.%d)\n", - $sphinx_major, $sphinx_minor, $sphinx_patch; - return; - } - } - - open IN, "$cmd --version 2>&1 |"; - while () { - if (m/^\s*sphinx-build\s+([\d]+)\.([\d\.]+)(\+\/[\da-f]+)?$/) { - $sphinx_major = $1; - $sphinx_minor = $2; - $sphinx_patch = $3; - last; - } - # Sphinx 1.2.x uses a different format - if (m/^\s*Sphinx.*\s+([\d]+)\.([\d\.]+)$/) { - $sphinx_major = $1; - $sphinx_minor = $2; - $sphinx_patch = $3; - last; - } - } - close IN; -} - # get kernel version from env sub get_kernel_version() { my $version = 'unknown kernel version'; @@ -859,9 +803,10 @@ sub output_function_rst(%) { $signature .= ")"; } - if ($sphinx_major < 3) { + if ($args{'typedef'} || $args{'functiontype'} eq "") { + print ".. c:macro:: ". $args{'function'} . "\n\n"; + if ($args{'typedef'}) { - print ".. c:type:: ". $args{'function'} . "\n\n"; print_lineno($declaration_start_line); print " **Typedef**: "; $lineprefix = ""; @@ -869,25 +814,10 @@ sub output_function_rst(%) { print "\n\n**Syntax**\n\n"; print " ``$signature``\n\n"; } else { - print ".. c:function:: $signature\n\n"; + print "``$signature``\n\n"; } } else { - if ($args{'typedef'} || $args{'functiontype'} eq "") { - print ".. c:macro:: ". $args{'function'} . "\n\n"; - - if ($args{'typedef'}) { - print_lineno($declaration_start_line); - print " **Typedef**: "; - $lineprefix = ""; - output_highlight_rst($args{'purpose'}); - print "\n\n**Syntax**\n\n"; - print " ``$signature``\n\n"; - } else { - print "``$signature``\n\n"; - } - } else { - print ".. c:function:: $signature\n\n"; - } + print ".. c:function:: $signature\n\n"; } if (!$args{'typedef'}) { @@ -955,13 +885,9 @@ sub output_enum_rst(%) { my $count; my $outer; - if ($sphinx_major < 3) { - my $name = "enum " . $args{'enum'}; - print "\n\n.. c:type:: " . $name . "\n\n"; - } else { - my $name = $args{'enum'}; - print "\n\n.. c:enum:: " . $name . "\n\n"; - } + my $name = $args{'enum'}; + print "\n\n.. c:enum:: " . $name . "\n\n"; + print_lineno($declaration_start_line); $lineprefix = " "; output_highlight_rst($args{'purpose'}); @@ -992,11 +918,8 @@ sub output_typedef_rst(%) { my $oldprefix = $lineprefix; my $name; - if ($sphinx_major < 3) { - $name = "typedef " . $args{'typedef'}; - } else { - $name = $args{'typedef'}; - } + $name = $args{'typedef'}; + print "\n\n.. c:type:: " . $name . "\n\n"; print_lineno($declaration_start_line); $lineprefix = " "; @@ -1012,17 +935,13 @@ sub output_struct_rst(%) { my ($parameter); my $oldprefix = $lineprefix; - if ($sphinx_major < 3) { - my $name = $args{'type'} . " " . $args{'struct'}; - print "\n\n.. c:type:: " . $name . "\n\n"; + my $name = $args{'struct'}; + if ($args{'type'} eq 'union') { + print "\n\n.. c:union:: " . $name . "\n\n"; } else { - my $name = $args{'struct'}; - if ($args{'type'} eq 'union') { - print "\n\n.. c:union:: " . $name . "\n\n"; - } else { - print "\n\n.. c:struct:: " . $name . "\n\n"; - } + print "\n\n.. c:struct:: " . $name . "\n\n"; } + print_lineno($declaration_start_line); $lineprefix = " "; output_highlight_rst($args{'purpose'}); @@ -2383,11 +2302,6 @@ sub process_file($) { close IN_FILE; } - -if ($output_mode eq "rst") { - get_sphinx_version() if (!$sphinx_major); -} - $kernelversion = get_kernel_version(); # generate a sequence of code that will splice in highlighting information @@ -2454,17 +2368,6 @@ Do not output documentation, only warnings. =head3 reStructuredText only -=over 8 - -=item -sphinx-version VERSION - -Use the ReST C domain dialect compatible with a specific Sphinx Version. - -If not specified, kernel-doc will auto-detect using the sphinx-build version -found on PATH. - -=back - =head2 Output selection (mutually exclusive): =over 8 -- GitLab From f553741ac8c0e467a3b873e305f34b902e50b86d Mon Sep 17 00:00:00 2001 From: zihan zhou <15645113830zzh@gmail.com> Date: Sat, 8 Feb 2025 16:08:52 +0800 Subject: [PATCH 241/934] sched: Cancel the slice protection of the idle entity A wakeup non-idle entity should preempt idle entity at any time, but because of the slice protection of the idle entity, the non-idle entity has to wait, so just cancel it. This patch is aimed at minimizing the impact of SCHED_IDLE on SCHED_NORMAL. For example, a task with SCHED_IDLE policy that sleeps for 1s and then runs for 3 ms, running cyclictest on the same cpu, has a maximum latency of 3 ms, which is caused by the slice protection of the idle entity. It is unreasonable. With this patch, the cyclictest latency under the same conditions is basically the same on the cpu with idle processes and on empty cpu. [peterz: add helpers] Fixes: 63304558ba5d ("sched/eevdf: Curb wakeup-preemption") Signed-off-by: zihan zhou <15645113830zzh@gmail.com> Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20250208080850.16300-1-15645113830zzh@gmail.com --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c0ef435a7aae..61b826fa06805 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -883,6 +883,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } +/* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + /* * Earliest Eligible Virtual Deadline First * @@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). - */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -5528,11 +5544,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. - */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); @@ -8781,8 +8794,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; @@ -8801,8 +8821,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) - se->vlag = se->deadline + 1; + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* * If @p has become the most eligible task, force preemption. -- GitLab From 2ae891b826958b60919ea21c727f77bcd6ffcc2c Mon Sep 17 00:00:00 2001 From: zihan zhou <15645113830zzh@gmail.com> Date: Sat, 8 Feb 2025 15:53:23 +0800 Subject: [PATCH 242/934] sched: Reduce the default slice to avoid tasks getting an extra tick The old default value for slice is 0.75 msec * (1 + ilog(ncpus)) which means that we have a default slice of: 0.75 for 1 cpu 1.50 up to 3 cpus 2.25 up to 7 cpus 3.00 for 8 cpus and above. For HZ=250 and HZ=100, because of the tick accuracy, the runtime of tasks is far higher than their slice. For HZ=1000 with 8 cpus or more, the accuracy of tick is already satisfactory, but there is still an issue that tasks will get an extra tick because the tick often arrives a little faster than expected. In this case, the task can only wait until the next tick to consider that it has reached its deadline, and will run 1ms longer. vruntime + sysctl_sched_base_slice = deadline |-----------|-----------|-----------|-----------| 1ms 1ms 1ms 1ms ^ ^ ^ ^ tick1 tick2 tick3 tick4(nearly 4ms) There are two reasons for tick error: clockevent precision and the CONFIG_IRQ_TIME_ACCOUNTING/CONFIG_PARAVIRT_TIME_ACCOUNTING. with CONFIG_IRQ_TIME_ACCOUNTING every tick will be less than 1ms, but even without it, because of clockevent precision, tick still often less than 1ms. In order to make scheduling more precise, we changed 0.75 to 0.70, Using 0.70 instead of 0.75 should not change much for other configs and would fix this issue: 0.70 for 1 cpu 1.40 up to 3 cpus 2.10 up to 7 cpus 2.8 for 8 cpus and above. This does not guarantee that tasks can run the slice time accurately every time, but occasionally running an extra tick has little impact. Signed-off-by: zihan zhou <15645113830zzh@gmail.com> Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20250208075322.13139-1-15645113830zzh@gmail.com --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61b826fa06805..17847527aa202 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -74,10 +74,10 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -- GitLab From b9f2b29b94943b08157e3dfc970baabc7944dbc3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 5 Feb 2025 11:24:38 +0800 Subject: [PATCH 243/934] sched: Don't define sched_clock_irqtime as static key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sched_clock_irqtime was defined as a static key in commit 8722903cbb8f ('sched: Define sched_clock_irqtime as static key'). However, this change introduces a 'sleeping in atomic context' warning, as shown below: arch/x86/kernel/tsc.c:1214 mark_tsc_unstable() warn: sleeping in atomic context As analyzed by Dan, the affected code path is as follows: vcpu_load() <- disables preempt -> kvm_arch_vcpu_load() -> mark_tsc_unstable() <- sleeps virt/kvm/kvm_main.c 166 void vcpu_load(struct kvm_vcpu *vcpu) 167 { 168 int cpu = get_cpu(); ^^^^^^^^^^ This get_cpu() disables preemption. 169 170 __this_cpu_write(kvm_running_vcpu, vcpu); 171 preempt_notifier_register(&vcpu->preempt_notifier); 172 kvm_arch_vcpu_load(vcpu, cpu); 173 put_cpu(); 174 } arch/x86/kvm/x86.c 4979 if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { 4980 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 4981 rdtsc() - vcpu->arch.last_host_tsc; 4982 if (tsc_delta < 0) 4983 mark_tsc_unstable("KVM discovered backwards TSC"); arch/x86/kernel/tsc.c 1206 void mark_tsc_unstable(char *reason) 1207 { 1208 if (tsc_unstable) 1209 return; 1210 1211 tsc_unstable = 1; 1212 if (using_native_sched_clock()) 1213 clear_sched_clock_stable(); --> 1214 disable_sched_clock_irqtime(); ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ kernel/jump_label.c 245 void static_key_disable(struct static_key *key) 246 { 247 cpus_read_lock(); ^^^^^^^^^^^^^^^^ This lock has a might_sleep() in it which triggers the static checker warning. 248 static_key_disable_cpuslocked(key); 249 cpus_read_unlock(); 250 } Let revert this change for now as {disable,enable}_sched_clock_irqtime are used in many places, as pointed out by Sean, including the following: The code path in clocksource_watchdog(): clocksource_watchdog() | -> spin_lock(&watchdog_lock); | -> __clocksource_unstable() | -> clocksource.mark_unstable() == tsc_cs_mark_unstable() | -> disable_sched_clock_irqtime() And the code path in sched_clock_register(): /* Cannot register a sched_clock with interrupts on */ local_irq_save(flags); ... /* Enable IRQ time accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); local_irq_restore(flags); [lkp@intel.com: reported a build error in the prev version] Closes: https://lore.kernel.org/kvm/37a79ba3-9ce0-479c-a5b0-2bd75d573ed3@stanley.mountain/ Fixes: 8722903cbb8f ("sched: Define sched_clock_irqtime as static key") Reported-by: Dan Carpenter Debugged-by: Dan Carpenter Debugged-by: Sean Christopherson Debugged-by: Michal Koutný Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20250205032438.14668-1-laoar.shao@gmail.com --- kernel/sched/cputime.c | 8 ++++---- kernel/sched/sched.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5d9143dd08791..6dab4854c6c08 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -9,8 +9,6 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING -DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); - /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -24,14 +22,16 @@ DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); +int sched_clock_irqtime; + void enable_sched_clock_irqtime(void) { - static_branch_enable(&sched_clock_irqtime); + sched_clock_irqtime = 1; } void disable_sched_clock_irqtime(void) { - static_branch_disable(&sched_clock_irqtime); + sched_clock_irqtime = 0; } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda26..ab16d3d0e51cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3259,11 +3259,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); +extern int sched_clock_irqtime; static inline int irqtime_enabled(void) { - return static_branch_likely(&sched_clock_irqtime); + return sched_clock_irqtime; } /* -- GitLab From 563bc2161b94571ea425bbe2cf69fd38e24cdedf Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Tue, 11 Feb 2025 14:36:59 +0800 Subject: [PATCH 244/934] sched/eevdf: Force propagating min_slice of cfs_rq when {en,de}queue tasks When a task is enqueued and its parent cgroup se is already on_rq, this parent cgroup se will not be enqueued again, and hence the root->min_slice leaves unchanged. The same issue happens when a task is dequeued and its parent cgroup se has other runnable entities, and the parent cgroup se will not be dequeued. Force propagating min_slice when se doesn't need to be enqueued or dequeued. Ensure the se hierarchy always get the latest min_slice. Fixes: aef6987d8954 ("sched/eevdf: Propagate min_slice up the cgroup hierarchy") Signed-off-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250211063659.7180-1-dtcccc@linux.alibaba.com --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 17847527aa202..9279bfb740406 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7002,6 +7002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable += h_nr_runnable; @@ -7131,6 +7133,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable -= h_nr_runnable; -- GitLab From d34e798094ca7be935b629a42f8b237d4d5b7f1d Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Mon, 10 Feb 2025 18:30:18 +0800 Subject: [PATCH 245/934] sched/fair: Refactor can_migrate_task() to elimate looping The function "can_migrate_task()" utilize "for_each_cpu_and" with a "if" statement inside to find the destination cpu. It's the same logic to find the first set bit of the result of the bitwise-AND of "env->dst_grpmask", "env->cpus" and "p->cpus_ptr". Refactor it by using "cpumask_first_and_and()" to perform bitwise-AND for "env->dst_grpmask", "env->cpus" and "p->cpus_ptr" and pick the first cpu within the intersection as the destination cpu, so we can elimate the need of looping and multiple times of branch. After the refactoring this part of the code can speed up from ~115ns to ~54ns, according to the test below. Ran the test for 5 times and the result is showned in the following table, and the test script is paste in next section. ------------------------------------------------------- |Old method| 130| 118| 115| 109| 106| avg ~115ns| ------------------------------------------------------- |New method| 58| 55| 54| 48| 55| avg ~54ns| ------------------------------------------------------- Signed-off-by: I Hsin Cheng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250210103019.283824-1-richard120310@gmail.com --- kernel/sched/fair.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9279bfb740406..857808da23d84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9439,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; -- GitLab From 1623ced247f7cb1b48a27cca6b0f17fe5ab5942b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Feb 2025 21:34:12 +0200 Subject: [PATCH 246/934] x86/events/amd/iommu: Increase IOMMU_NAME_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The init_one_iommu() takes an unsigned int argument that can't be checked for the boundaries at compile time and GCC complains about that when build with `make W=1`: arch/x86/events/amd/iommu.c:441:53: note: directive argument in the range [0, 4294967294] arch/x86/events/amd/iommu.c:441:9: note: ‘snprintf’ output between 12 and 21 bytes into a destination of size 16 Increase the size to cover all possible cases. Signed-off-by: Andy Shevchenko Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250210193412.483233-1-andriy.shevchenko@linux.intel.com --- arch/x86/events/amd/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index b15f7b950d2e0..f8228d8243f7d 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -30,7 +30,7 @@ #define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) #define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) -#define IOMMU_NAME_SIZE 16 +#define IOMMU_NAME_SIZE 24 struct perf_amd_iommu { struct list_head list; -- GitLab From 2eeb03ad9f42dfece63051be2400af487ddb96d2 Mon Sep 17 00:00:00 2001 From: Saranya R Date: Wed, 12 Feb 2025 22:07:20 +0530 Subject: [PATCH 247/934] soc: qcom: pdr: Fix the potential deadlock When some client process A call pdr_add_lookup() to add the look up for the service and does schedule locator work, later a process B got a new server packet indicating locator is up and call pdr_locator_new_server() which eventually sets pdr->locator_init_complete to true which process A sees and takes list lock and queries domain list but it will timeout due to deadlock as the response will queued to the same qmi->wq and it is ordered workqueue and process B is not able to complete new server request work due to deadlock on list lock. Fix it by removing the unnecessary list iteration as the list iteration is already being done inside locator work, so avoid it here and just call schedule_work() here. Process A Process B process_scheduled_works() pdr_add_lookup() qmi_data_ready_work() process_scheduled_works() pdr_locator_new_server() pdr->locator_init_complete=true; pdr_locator_work() mutex_lock(&pdr->list_lock); pdr_locate_service() mutex_lock(&pdr->list_lock); pdr_get_domain_list() pr_err("PDR: %s get domain list txn wait failed: %d\n", req->service_name, ret); Timeout error log due to deadlock: " PDR: tms/servreg get domain list txn wait failed: -110 PDR: service lookup for msm/adsp/sensor_pd:tms/servreg failed: -110 " Thanks to Bjorn and Johan for letting me know that this commit also fixes an audio regression when using the in-kernel pd-mapper as that makes it easier to hit this race. [1] Link: https://lore.kernel.org/lkml/Zqet8iInnDhnxkT9@hovoldconsulting.com/ # [1] Fixes: fbe639b44a82 ("soc: qcom: Introduce Protection Domain Restart helpers") CC: stable@vger.kernel.org Reviewed-by: Bjorn Andersson Tested-by: Bjorn Andersson Tested-by: Johan Hovold Signed-off-by: Saranya R Co-developed-by: Mukesh Ojha Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20250212163720.1577876-1-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/pdr_interface.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/soc/qcom/pdr_interface.c b/drivers/soc/qcom/pdr_interface.c index 328b6153b2be6..71be378d2e43a 100644 --- a/drivers/soc/qcom/pdr_interface.c +++ b/drivers/soc/qcom/pdr_interface.c @@ -75,7 +75,6 @@ static int pdr_locator_new_server(struct qmi_handle *qmi, { struct pdr_handle *pdr = container_of(qmi, struct pdr_handle, locator_hdl); - struct pdr_service *pds; mutex_lock(&pdr->lock); /* Create a local client port for QMI communication */ @@ -87,12 +86,7 @@ static int pdr_locator_new_server(struct qmi_handle *qmi, mutex_unlock(&pdr->lock); /* Service pending lookup requests */ - mutex_lock(&pdr->list_lock); - list_for_each_entry(pds, &pdr->lookups, node) { - if (pds->need_locator_lookup) - schedule_work(&pdr->locator_work); - } - mutex_unlock(&pdr->list_lock); + schedule_work(&pdr->locator_work); return 0; } -- GitLab From ad3b301aa05af408462775368bd25d2a05409fe1 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 14 Feb 2025 18:57:36 +0900 Subject: [PATCH 248/934] sched_ext: Provides a sysfs 'events' to expose core event counters Add a sysfs entry at /sys/kernel/sched_ext/root/events to expose core event counters through the files system interface. Each line of the file shows the event name and its counter value. In addition, the format of scx_dump_event() is adjusted as the event name gets longer. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2e1a1e4fc3044..aec098efd6fbc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1576,7 +1576,7 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); * @kind: a kind of event to dump */ #define scx_dump_event(s, events, kind) do { \ - dump_line(&(s), "%30s: %16llu", #kind, (events)->kind); \ + dump_line(&(s), "%40s: %16llu", #kind, (events)->kind); \ } while (0) @@ -4383,8 +4383,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj, } SCX_ATTR(ops); +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_event_stats events; + int at = 0; + + scx_bpf_events(&events, sizeof(events)); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + static struct attribute *scx_sched_attrs[] = { &scx_attr_ops.attr, + &scx_attr_events.attr, NULL, }; ATTRIBUTE_GROUPS(scx_sched); -- GitLab From f2c880fc8133e4607f9e0e22d08e5e2098c3604a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Feb 2025 08:46:20 -1000 Subject: [PATCH 249/934] tools/sched_ext: Sync with scx repo Synchronize with https://github.com/sched-ext/scx at d384453984a0 ("kernel: Sync at ad3b301aa05a ("sched_ext: Provides a sysfs 'events' to expose core event counters")"). Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 29 +++++++++++++++++++++--- tools/sched_ext/scx_central.bpf.c | 2 -- tools/sched_ext/scx_central.c | 15 +++++++++--- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f1caf9fc8f8cc..77bbe0199a32c 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -7,6 +7,13 @@ #ifndef __SCX_COMMON_BPF_H #define __SCX_COMMON_BPF_H +/* + * The generated kfunc prototypes in vmlinux.h are missing address space + * attributes which cause build failures. For now, suppress the generated + * prototypes. See https://github.com/sched-ext/scx/issues/1111. + */ +#define BPF_NO_KFUNC_PROTOTYPES + #ifdef LSP #define __bpf__ #include "../vmlinux.h" @@ -78,10 +85,7 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; - void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; -#define scx_read_event(e, name) \ - (bpf_core_field_exists((e)->name) ? (e)->name : 0) /* * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from @@ -89,6 +93,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ */ #define BPF_FOR_EACH_ITER (&___it) +#define scx_read_event(e, name) \ + (bpf_core_field_exists((e)->name) ? (e)->name : 0) + static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} @@ -581,6 +588,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +#define READ_ONCE_ARENA(type, x) \ +({ \ + union { type __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size((void *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE_ARENA(type, x, val) \ +({ \ + union { type __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size((void *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + /* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 5c165af1fa27c..50bc1737c167a 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -256,7 +256,6 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) u64 now = scx_bpf_now(); u64 nr_to_kick = nr_queued; s32 i, curr_cpu; - struct scx_event_stats events; curr_cpu = bpf_get_smp_processor_id(); if (timer_pinned && (curr_cpu != central_cpu)) { @@ -292,7 +291,6 @@ static int central_timerfn(void *map, int *key, struct bpf_timer *timer) bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); __sync_fetch_and_add(&nr_timers, 1); - return 0; } diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 1e9f74525d8fe..6ba6e610eeaa0 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -60,14 +61,22 @@ restart: skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids <= INT32_MAX); + while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; - case 'c': - skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + case 'c': { + u32 central_cpu = strtoul(optarg, NULL, 0); + if (central_cpu >= skel->rodata->nr_cpu_ids) { + fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids); + return -1; + } + skel->rodata->central_cpu = (s32)central_cpu; break; + } case 'v': verbose = true; break; @@ -96,7 +105,7 @@ restart: */ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); - CPU_ZERO(cpuset); + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU %d (max %d)", -- GitLab From 7665054ee0dd0caba6f5f7bae48aa5f221218496 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 14 Feb 2025 20:40:00 +0100 Subject: [PATCH 250/934] nodemask: add nodes_copy() Nodemasks API misses the plain nodes_copy() which is required in this series. Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/nodemask.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 9fd7a0ce9c1a7..41cf43c4e70f2 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -191,6 +191,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) +static __always_inline void __nodes_copy(nodemask_t *dstp, + const nodemask_t *srcp, unsigned int nbits) +{ + bitmap_copy(dstp->bits, srcp->bits, nbits); +} + #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, -- GitLab From 14a8262f505bc4478c50e66309057bc0d0d4b62e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 14 Feb 2025 20:40:01 +0100 Subject: [PATCH 251/934] nodemask: numa: reorganize inclusion path Nodemasks now pull linux/numa.h for MAX_NUMNODES and NUMA_NO_NODE macros. This series makes numa.h depending on nodemasks, so we hit a circular dependency. Nodemasks library is highly employed by NUMA code, and it would be logical to resolve the circular dependency by making NUMA headers dependent nodemask.h. Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/nodemask.h | 1 - include/linux/nodemask_types.h | 11 ++++++++++- include/linux/numa.h | 10 +--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 41cf43c4e70f2..f0ac0633366b9 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -94,7 +94,6 @@ #include #include #include -#include #include extern nodemask_t _unused_nodemask_arg_; diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h index 6b28d97ea6ed0..f850a48742f1f 100644 --- a/include/linux/nodemask_types.h +++ b/include/linux/nodemask_types.h @@ -3,7 +3,16 @@ #define __LINUX_NODEMASK_TYPES_H #include -#include + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/numa.h b/include/linux/numa.h index 3567e40329ebc..31d8bf8a951a7 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,16 +3,8 @@ #define _LINUX_NUMA_H #include #include +#include -#ifdef CONFIG_NODES_SHIFT -#define NODES_SHIFT CONFIG_NODES_SHIFT -#else -#define NODES_SHIFT 0 -#endif - -#define MAX_NUMNODES (1 << NODES_SHIFT) - -#define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) static inline bool numa_valid_node(int nid) -- GitLab From 16d79f2a4f155b54e7580563c6c03832506b3b09 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:02 +0100 Subject: [PATCH 252/934] mm/numa: Introduce nearest_node_nodemask() Introduce the new helper nearest_node_nodemask() to find the closest node in a specified nodemask from a given starting node. Returns MAX_NUMNODES if no node is found. Suggested-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Acked-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- include/linux/numa.h | 7 +++++++ mm/mempolicy.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/numa.h b/include/linux/numa.h index 31d8bf8a951a7..e6baaf6051bcf 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -31,6 +31,8 @@ void __init alloc_offline_node_data(int nid); /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); +int nearest_node_nodemask(int node, nodemask_t *mask); + #ifndef memory_add_physaddr_to_nid int memory_add_physaddr_to_nid(u64 start); #endif @@ -47,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state) return NUMA_NO_NODE; } +static inline int nearest_node_nodemask(int node, nodemask_t *mask) +{ + return NUMA_NO_NODE; +} + static inline int memory_add_physaddr_to_nid(u64 start) { return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 162407fbf2bc7..488cad280efb3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state) } EXPORT_SYMBOL_GPL(numa_nearest_node); +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. + * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. + */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(nearest_node_nodemask); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; -- GitLab From f09177ca5f242a32368a2e9414dce4c90dc1d405 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:03 +0100 Subject: [PATCH 253/934] sched/topology: Introduce for_each_node_numadist() iterator Introduce the new helper for_each_node_numadist() to iterate over node IDs in order of increasing NUMA distance from a given starting node. This iterator is somehow similar to for_each_numa_hop_mask(), but instead of providing a cpumask at each iteration, it provides a node ID. Example usage: nodemask_t unvisited = NODE_MASK_ALL; int node, start = cpu_to_node(smp_processor_id()); node = start; for_each_node_numadist(node, unvisited) pr_info("node (%d, %d) -> %d\n", start, node, node_distance(start, node)); On a system with equidistant nodes: $ numactl -H ... node distances: node 0 1 2 3 0: 10 20 20 20 1: 20 10 20 20 2: 20 20 10 20 3: 20 20 20 10 Output of the example above (on node 0): [ 7.367022] node (0, 0) -> 10 [ 7.367151] node (0, 1) -> 20 [ 7.367186] node (0, 2) -> 20 [ 7.367247] node (0, 3) -> 20 On a system with non-equidistant nodes (simulated using virtme-ng): $ numactl -H ... node distances: node 0 1 2 3 0: 10 51 31 41 1: 51 10 21 61 2: 31 21 10 11 3: 41 61 11 10 Output of the example above (on node 0): [ 8.953644] node (0, 0) -> 10 [ 8.953712] node (0, 2) -> 31 [ 8.953764] node (0, 3) -> 41 [ 8.953817] node (0, 1) -> 51 Suggested-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Acked-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- include/linux/topology.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/topology.h b/include/linux/topology.h index 52f5850730b3e..a1815f4395ab6 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -261,6 +261,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops) } #endif /* CONFIG_NUMA */ +/** + * for_each_node_numadist() - iterate over nodes in increasing distance + * order, starting from a given node + * @node: the iteration variable and the starting node. + * @unvisited: a nodemask to keep track of the unvisited nodes. + * + * This macro iterates over NUMA node IDs in increasing distance from the + * starting @node and yields MAX_NUMNODES when all the nodes have been + * visited. + * + * Note that by the time the loop completes, the @unvisited nodemask will + * be fully cleared, unless the loop exits early. + * + * The difference between for_each_node() and for_each_node_numadist() is + * that the former allows to iterate over nodes in numerical order, whereas + * the latter iterates over nodes in increasing order of distance. + * + * This complexity of this iterator is O(N^2), where N represents the + * number of nodes, as each iteration involves scanning all nodes to + * find the one with the shortest distance. + * + * Requires rcu_lock to be held. + */ +#define for_each_node_numadist(node, unvisited) \ + for (int __start = (node), \ + (node) = nearest_node_nodemask((__start), &(unvisited)); \ + (node) < MAX_NUMNODES; \ + node_clear((node), (unvisited)), \ + (node) = nearest_node_nodemask((__start), &(unvisited))) + /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. -- GitLab From d73249f88743df63a2bdce0e3153369963113710 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:04 +0100 Subject: [PATCH 254/934] sched_ext: idle: Make idle static keys private Make all the static keys used by the idle CPU selection policy private to ext_idle.c. This avoids unnecessary exposure in headers and improves code encapsulation. Cc: Yury Norov Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 ++------- kernel/sched/ext_idle.c | 39 ++++++++++++++++++++++++++------------- kernel/sched/ext_idle.h | 12 ++++-------- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index aec098efd6fbc..7c17e05ed15b1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4765,7 +4765,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&scx_ops_enq_exiting); static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); - static_branch_disable(&scx_builtin_idle_enabled); + scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { @@ -5403,12 +5403,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - scx_idle_reset_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } + scx_idle_enable(ops); /* * Lock out forks, cgroup on/offlining and moves before opening the diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index cb981956005b4..ed1804506585b 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -12,7 +12,7 @@ #include "ext_idle.h" /* Enable/disable built-in idle CPU selection policy */ -DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); #ifdef CONFIG_SMP #ifdef CONFIG_CPUMASK_OFFSTACK @@ -22,10 +22,10 @@ DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); #endif /* Enable/disable LLC aware optimizations */ -DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); /* Enable/disable NUMA aware optimizations */ -DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); static struct { cpumask_var_t cpu; @@ -441,16 +441,6 @@ cpu_found: return cpu; } -void scx_idle_reset_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); -} - void scx_idle_init_masks(void) { BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); @@ -532,6 +522,29 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) } #endif /* CONFIG_SMP */ +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + static_branch_disable(&scx_builtin_idle_enabled); + return; + } + static_branch_enable(&scx_builtin_idle_enabled); + +#ifdef CONFIG_SMP + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + cpumask_copy(idle_masks.cpu, cpu_online_mask); + cpumask_copy(idle_masks.smt, cpu_online_mask); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); +} + /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index 7a13a74815ba7..bbac0fd9a5ddd 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -10,20 +10,15 @@ #ifndef _KERNEL_SCHED_EXT_IDLE_H #define _KERNEL_SCHED_EXT_IDLE_H -extern struct static_key_false scx_builtin_idle_enabled; +struct sched_ext_ops; #ifdef CONFIG_SMP -extern struct static_key_false scx_selcpu_topo_llc; -extern struct static_key_false scx_selcpu_topo_numa; - void scx_idle_update_selcpu_topology(void); -void scx_idle_reset_masks(void); void scx_idle_init_masks(void); bool scx_idle_test_and_clear_cpu(int cpu); s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags); #else /* !CONFIG_SMP */ static inline void scx_idle_update_selcpu_topology(void) {} -static inline void scx_idle_reset_masks(void) {} static inline void scx_idle_init_masks(void) {} static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) @@ -33,7 +28,8 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flag #endif /* CONFIG_SMP */ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found); - -extern int scx_idle_init(void); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); #endif /* _KERNEL_SCHED_EXT_IDLE_H */ -- GitLab From 0aaaf89df86da28fa4928faa55a529beac1d6cab Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:05 +0100 Subject: [PATCH 255/934] sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE Add the new scheduler flag SCX_OPS_BUILTIN_IDLE_PER_NODE, which allows BPF schedulers to select between using a global flat idle cpumask or multiple per-node cpumasks. This only introduces the flag and the mechanism to enable/disable this feature without affecting any scheduling behavior. Cc: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Reviewed-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 21 ++++++++++++++++++-- kernel/sched/ext_idle.c | 29 +++++++++++++++++++++------- kernel/sched/ext_idle.h | 4 ++-- tools/sched_ext/include/scx/compat.h | 3 +++ 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7c17e05ed15b1..330a359d79301 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -154,6 +154,12 @@ enum scx_ops_flags { */ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, + /* * CPU cgroup support flags */ @@ -165,6 +171,7 @@ enum scx_ops_flags { SCX_OPS_ENQ_MIGRATION_DISABLED | SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -3427,7 +3434,7 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); if (scx_enabled()) - scx_idle_update_selcpu_topology(); + scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); @@ -5228,6 +5235,16 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + return -EINVAL; + } + return 0; } @@ -5352,7 +5369,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); - scx_idle_update_selcpu_topology(); + scx_idle_update_selcpu_topology(ops); cpus_read_unlock(); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index ed1804506585b..0912f94b95cdc 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -14,6 +14,9 @@ /* Enable/disable built-in idle CPU selection policy */ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + #ifdef CONFIG_SMP #ifdef CONFIG_CPUMASK_OFFSTACK #define CL_ALIGNED_IF_ONSTACK @@ -204,7 +207,7 @@ static bool llc_numa_mismatch(void) * CPU belongs to a single LLC domain, and that each LLC domain is entirely * contained within a single NUMA node. */ -void scx_idle_update_selcpu_topology(void) +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) { bool enable_llc = false, enable_numa = false; unsigned int nr_cpus; @@ -237,13 +240,19 @@ void scx_idle_update_selcpu_topology(void) * If all CPUs belong to the same NUMA node and the same LLC domain, * enabling both NUMA and LLC optimizations is unnecessary, as checking * for an idle CPU in the same domain twice is redundant. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes querying the corresponding per-node cpumask. */ - nr_cpus = numa_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) - enable_numa = true; - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } } rcu_read_unlock(); @@ -530,6 +539,11 @@ void scx_idle_enable(struct sched_ext_ops *ops) } static_branch_enable(&scx_builtin_idle_enabled); + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable(&scx_builtin_idle_per_node); + else + static_branch_disable(&scx_builtin_idle_per_node); + #ifdef CONFIG_SMP /* * Consider all online cpus idle. Should converge to the actual state @@ -543,6 +557,7 @@ void scx_idle_enable(struct sched_ext_ops *ops) void scx_idle_disable(void) { static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); } /******************************************************************************** diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index bbac0fd9a5ddd..339b6ec9c4cb7 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -13,12 +13,12 @@ struct sched_ext_ops; #ifdef CONFIG_SMP -void scx_idle_update_selcpu_topology(void); +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void scx_idle_init_masks(void); bool scx_idle_test_and_clear_cpu(int cpu); s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags); #else /* !CONFIG_SMP */ -static inline void scx_idle_update_selcpu_topology(void) {} +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} static inline void scx_idle_init_masks(void) {} static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index b50280e2ba2ba..d63cf40be8eee 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -109,6 +109,9 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field #define SCX_OPS_SWITCH_PARTIAL \ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") +#define SCX_OPS_BUILTIN_IDLE_PER_NODE \ + __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_BUILTIN_IDLE_PER_NODE") + static inline long scx_hotplug_seq(void) { int fd; -- GitLab From 48849271e66114cb980a3bc44218b04d0f8cdcdd Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:06 +0100 Subject: [PATCH 256/934] sched_ext: idle: Per-node idle cpumasks Using a single global idle mask can lead to inefficiencies and a lot of stress on the cache coherency protocol on large systems with multiple NUMA nodes, since all the CPUs can create a really intense read/write activity on the single global cpumask. Therefore, split the global cpumask into multiple per-NUMA node cpumasks to improve scalability and performance on large systems. The concept is that each cpumask will track only the idle CPUs within its corresponding NUMA node, treating CPUs in other NUMA nodes as busy. In this way concurrent access to the idle cpumask will be restricted within each NUMA node. The split of multiple per-node idle cpumasks can be controlled using the SCX_OPS_BUILTIN_IDLE_PER_NODE flag. By default SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled and a global host-wide idle cpumask is used, maintaining the previous behavior. NOTE: if a scheduler explicitly enables the per-node idle cpumasks (via SCX_OPS_BUILTIN_IDLE_PER_NODE), scx_bpf_get_idle_cpu/smtmask() will trigger an scx error, since there are no system-wide cpumasks. = Test = Hardware: - System: DGX B200 - CPUs: 224 SMT threads (112 physical cores) - Processor: INTEL(R) XEON(R) PLATINUM 8570 - 2 NUMA nodes Scheduler: - scx_simple [1] (so that we can focus at the built-in idle selection policy and not at the scheduling policy itself) Test: - Run a parallel kernel build `make -j $(nproc)` and measure the average elapsed time over 10 runs: avg time | stdev ---------+------ before: 52.431s | 2.895 after: 50.342s | 2.895 = Conclusion = Splitting the global cpumask into multiple per-NUMA cpumasks helped to achieve a speedup of approximately +4% with this particular architecture and test case. The same test on a DGX-1 (40 physical cores, Intel Xeon E5-2698 v4 @ 2.20GHz, 2 NUMA nodes) shows a speedup of around 1.5-3%. On smaller systems, I haven't noticed any measurable regressions or improvements with the same test (parallel kernel build) and scheduler (scx_simple). Moreover, with a modified scx_bpfland that uses the new NUMA-aware APIs I observed an additional +2-2.5% performance improvement with the same test. [1] https://github.com/sched-ext/scx/blob/main/scheds/c/scx_simple.bpf.c Cc: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Reviewed-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 + kernel/sched/ext_idle.c | 283 ++++++++++++++++++++++----- kernel/sched/ext_idle.h | 4 +- tools/sched_ext/include/scx/compat.h | 3 + 4 files changed, 236 insertions(+), 55 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 330a359d79301..95603db36f043 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -806,6 +806,7 @@ enum scx_deq_flags { enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ }; enum scx_kick_flags { diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 0912f94b95cdc..8dacccc82ed63 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -18,25 +18,61 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); #ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - /* Enable/disable LLC aware optimizations */ static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); /* Enable/disable NUMA aware optimizations */ static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -static struct { +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * from is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { cpumask_var_t cpu; cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Return the idle masks associated to a target @node. + * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} bool scx_idle_test_and_clear_cpu(int cpu) { + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + #ifdef CONFIG_SCHED_SMT /* * SMT mask should be cleared whether we can claim @cpu or not. The SMT @@ -45,33 +81,38 @@ bool scx_idle_test_and_clear_cpu(int cpu) */ if (sched_smt_active()) { const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; /* * If offline, @cpu is not its own sibling and * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. * * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to * reduce memory writes, which may help alleviate cache * coherence pressure. */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); } #endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); } -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) { int cpu; retry: if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); if (cpu < nr_cpu_ids) goto found; @@ -79,7 +120,7 @@ retry: return -EBUSY; } - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); if (cpu >= nr_cpu_ids) return -EBUSY; @@ -90,6 +131,85 @@ found: goto retry; } +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes. + */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck is most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. + */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + /* * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC * domain is not defined). @@ -302,6 +422,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool { const struct cpumask *llc_cpus = NULL; const struct cpumask *numa_cpus = NULL; + int node = scx_cpu_node_if_enabled(prev_cpu); s32 cpu; *found = false; @@ -359,9 +480,9 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool * piled up on it even if there is an idle core elsewhere on * the system. */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + !cpumask_empty(idle_cpumask(cpu_to_node(cpu))->cpu)) { if (cpumask_test_cpu(cpu, p->cpus_ptr)) goto cpu_found; } @@ -375,7 +496,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool /* * Keep using @prev_cpu if it's part of a fully idle core. */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto cpu_found; @@ -385,7 +506,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool * Search for any fully idle core in the same LLC domain. */ if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); if (cpu >= 0) goto cpu_found; } @@ -394,15 +515,19 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool * Search for any fully idle core in the same NUMA node. */ if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); if (cpu >= 0) goto cpu_found; } /* * Search for any full idle core usable by the task. + * + * If NUMA aware idle selection is enabled, the search will + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, SCX_PICK_IDLE_CORE); if (cpu >= 0) goto cpu_found; } @@ -419,7 +544,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool * Search for any idle CPU in the same LLC domain. */ if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); if (cpu >= 0) goto cpu_found; } @@ -428,7 +553,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool * Search for any idle CPU in the same NUMA node. */ if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); if (cpu >= 0) goto cpu_found; } @@ -436,7 +561,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool /* * Search for any idle CPU usable by the task. */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, 0); if (cpu >= 0) goto cpu_found; @@ -450,30 +575,54 @@ cpu_found: return cpu; } +/* + * Initialize global and per-node idle cpumasks. + */ void scx_idle_init_masks(void) { - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); + int node; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(node) { + scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, node); + BUG_ON(!scx_idle_node_masks[node]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + } } static void update_builtin_idle(int cpu, bool idle) { - assign_cpu(cpu, idle_masks.cpu, idle); + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); #ifdef CONFIG_SCHED_SMT if (sched_smt_active()) { const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; if (idle) { /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. */ - if (!cpumask_subset(smt, idle_masks.cpu)) + if (!cpumask_subset(smt, idle_cpus)) return; - cpumask_or(idle_masks.smt, idle_masks.smt, smt); + cpumask_or(idle_smts, idle_smts, smt); } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + cpumask_andnot(idle_smts, idle_smts, smt); } } #endif @@ -529,15 +678,36 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) if (do_notify || is_idle_task(rq->curr)) update_builtin_idle(cpu, idle); } + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} #endif /* CONFIG_SMP */ void scx_idle_enable(struct sched_ext_ops *ops) { - if (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable(&scx_builtin_idle_enabled); + else static_branch_disable(&scx_builtin_idle_enabled); - return; - } - static_branch_enable(&scx_builtin_idle_enabled); if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) static_branch_enable(&scx_builtin_idle_per_node); @@ -545,12 +715,7 @@ void scx_idle_enable(struct sched_ext_ops *ops) static_branch_disable(&scx_builtin_idle_per_node); #ifdef CONFIG_SMP - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); + reset_idle_masks(ops); #endif } @@ -610,15 +775,21 @@ prev_cpu: * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking * per-CPU cpumask. * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + if (!check_builtin_idle_enabled()) return cpu_none_mask; #ifdef CONFIG_SMP - return idle_masks.cpu; + return idle_cpumask(NUMA_NO_NODE)->cpu; #else return cpu_none_mask; #endif @@ -629,18 +800,24 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) * per-physical-core cpumask. Can be used to determine if an entire physical * core is free. * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + if (!check_builtin_idle_enabled()) return cpu_none_mask; #ifdef CONFIG_SMP if (sched_smt_active()) - return idle_masks.smt; + return idle_cpumask(NUMA_NO_NODE)->smt; else - return idle_masks.cpu; + return idle_cpumask(NUMA_NO_NODE)->cpu; #else return cpu_none_mask; #endif @@ -707,7 +884,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, if (!check_builtin_idle_enabled()) return -EBUSY; - return scx_pick_idle_cpu(cpus_allowed, flags); + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); } /** @@ -730,7 +907,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, s32 cpu; if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); if (cpu >= 0) return cpu; } diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index 339b6ec9c4cb7..68c4307ce4f6f 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -16,12 +16,12 @@ struct sched_ext_ops; void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void scx_idle_init_masks(void); bool scx_idle_test_and_clear_cpu(int cpu); -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); #else /* !CONFIG_SMP */ static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} static inline void scx_idle_init_masks(void) {} static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } -static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) { return -EBUSY; } diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index d63cf40be8eee..03e06bd15c738 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -112,6 +112,9 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field #define SCX_OPS_BUILTIN_IDLE_PER_NODE \ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_BUILTIN_IDLE_PER_NODE") +#define SCX_PICK_IDLE_IN_NODE \ + __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", "SCX_PICK_IDLE_IN_NODE") + static inline long scx_hotplug_seq(void) { int fd; -- GitLab From d20610c19b4a22bc69085b7eb7a02741d51de30e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:41 +0000 Subject: [PATCH 257/934] perf/amd/ibs: Add support for OP Load Latency Filtering IBS Op PMU on Zen5 uarch added new Load Latency filtering capability. It's advertised by CPUID_Fn8000001B_EAX bit 12. When enabled, IBS HW will raise interrupt only for sample that had an IbsDcMissLat value greater than N cycles, where N is a programmable value defined as multiples of 128 (i.e. 128, 256, 384 etc.) from 128-2048 cycles. Similar to L3MissOnly, IBS HW internally drops the sample and restarts if the sample does not meet the filtering criteria. Add support for LdLat filtering in IBS Op PMU. Since hardware supports threshold in multiple of 128, add a software filter on top to support latency threshold with the granularity of 1 cycle between [128-2048]. Example usage: # perf record -a -e ibs_op/ldlat=128/ -- sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-2-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 93 ++++++++++++++++++++++++++++--- arch/x86/include/asm/amd-ibs.h | 3 +- arch/x86/include/asm/perf_event.h | 3 + 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 7978d7910adc3..85b29b3f305b2 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -268,6 +268,14 @@ static int validate_group(struct perf_event *event) return 0; } +static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + return perf_ibs == &perf_ibs_op && + (ibs_caps & IBS_CAPS_OPLDLAT) && + (event->attr.config1 & 0xFFF); +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -339,6 +347,17 @@ static int perf_ibs_init(struct perf_event *event) return -EINVAL; } + if (perf_ibs_ldlat_event(perf_ibs, event)) { + u64 ldlat = event->attr.config1 & 0xFFF; + + if (ldlat < 128 || ldlat > 2048) + return -EINVAL; + ldlat >>= 7; + + config |= (ldlat - 1) << 59; + config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN; + } + /* * If we modify hwc->sample_period, we also need to update * hwc->last_period and hwc->period_left. @@ -607,7 +626,9 @@ PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -615,6 +636,12 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; } +static umode_t +ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0; +} + static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, &format_attr_swfilt.attr, @@ -631,6 +658,11 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_cap_attrs[] = { + &ibs_op_ldlat_cap.attr.attr, + NULL, +}; + static struct attribute_group group_fetch_formats = { .name = "format", .attrs = fetch_attrs, @@ -648,6 +680,12 @@ static struct attribute_group group_zen4_ibs_extensions = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_ibs_op_ldlat_cap = { + .name = "caps", + .attrs = ibs_op_ldlat_cap_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_fetch_formats, &empty_caps_group, @@ -686,6 +724,11 @@ static struct attribute_group group_op_formats = { .attrs = op_attrs, }; +static struct attribute *ibs_op_ldlat_format_attrs[] = { + &ibs_op_ldlat_format.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -704,10 +747,18 @@ static const struct attribute_group *op_attr_groups[] = { NULL, }; +static struct attribute_group group_ibs_op_ldlat_format = { + .name = "format", + .attrs = ibs_op_ldlat_format_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, &group_zen4_ibs_extensions, + &group_ibs_op_ldlat_cap, + &group_ibs_op_ldlat_format, NULL, }; @@ -1060,15 +1111,25 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } } -static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, +static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return perf_ibs == &perf_ibs_op && + sample_type & (PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_WEIGHT_TYPE | + PERF_SAMPLE_ADDR | + PERF_SAMPLE_PHYS_ADDR); +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, + struct perf_event *event, int check_rip) { - if (sample_type & PERF_SAMPLE_RAW || - (perf_ibs == &perf_ibs_op && - (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR || - sample_type & PERF_SAMPLE_PHYS_ADDR))) + if (event->attr.sample_type & PERF_SAMPLE_RAW || + perf_ibs_is_mem_sample_type(perf_ibs, event) || + perf_ibs_ldlat_event(perf_ibs, event)) return perf_ibs->offset_max; else if (check_rip) return 3; @@ -1123,7 +1184,7 @@ fail: offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip); do { rdmsrl(msr + offset, *buf++); @@ -1132,6 +1193,22 @@ fail: perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + + if (perf_ibs_ldlat_event(perf_ibs, event)) { + union ibs_op_data3 op_data3; + + op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + /* + * Opening event is errored out if load latency threshold is + * outside of [128, 2048] range. Since the event has reached + * interrupt handler, we can safely assume the threshold is + * within [128, 2048] range. + */ + if (!op_data3.ld_op || !op_data3.dc_miss || + op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF)) + goto out; + } + /* * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately * depending on their availability. diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index cb2a5e113daa7..77f3a589a99ac 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 73b104049f8cf..a60efe47498de 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -502,6 +502,7 @@ struct pebs_cntr_header { #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) +#define IBS_CAPS_OPLDLAT (1U<<12) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -527,6 +528,8 @@ struct pebs_cntr_header { * The lower 7 bits of the current count are random bits * preloaded by hardware and ignored in software */ +#define IBS_OP_LDLAT_EN (1ULL<<63) +#define IBS_OP_LDLAT_THRSH (0xFULL<<59) #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) #define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) -- GitLab From 0b347a4218da08b1eb400c259d193bff463dae87 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:42 +0000 Subject: [PATCH 258/934] perf/amd/ibs: Update DTLB/PageSize decode logic IBS Op PMU on Zen5 reports DTLB and page size information differently compared to prior generation. The change is enumerated by CPUID_Fn8000001B_EAX[19]. IBS_OP_DATA3 Zen3/4 Zen5 ---------------------------------------------------------------- 19 IbsDcL2TlbHit1G Reserved ---------------------------------------------------------------- 6 IbsDcL2tlbHit2M Reserved ---------------------------------------------------------------- 5 IbsDcL1TlbHit1G PageSize: 4 IbsDcL1TlbHit2M 0 - 4K 1 - 2M 2 - 1G 3 - Reserved Valid only if IbsDcPhyAddrValid = 1 ---------------------------------------------------------------- 3 IbsDcL2TlbMiss IbsDcL2TlbMiss Valid only if IbsDcPhyAddrValid = 1 ---------------------------------------------------------------- 2 IbsDcL1tlbMiss IbsDcL1tlbMiss Valid only if IbsDcPhyAddrValid = 1 ---------------------------------------------------------------- o Currently, only bit 2 and 3 are interpreted by IBS NMI handler for PERF_SAMPLE_DATA_SRC. Add dependency on IbsDcPhyAddrValid for those bits. o Introduce new IBS Op PMU capability and expose it to userspace via PMU's sysfs directory. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-3-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 23 +++++++++++++++++++++++ arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 24 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 85b29b3f305b2..7b52b8e3a1851 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -629,6 +629,7 @@ PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); +PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -642,6 +643,12 @@ ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0; } +static umode_t +ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0; +} + static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, &format_attr_swfilt.attr, @@ -663,6 +670,11 @@ static struct attribute *ibs_op_ldlat_cap_attrs[] = { NULL, }; +static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { + &ibs_op_dtlb_pgsize_cap.attr.attr, + NULL, +}; + static struct attribute_group group_fetch_formats = { .name = "format", .attrs = fetch_attrs, @@ -686,6 +698,12 @@ static struct attribute_group group_ibs_op_ldlat_cap = { .is_visible = ibs_op_ldlat_is_visible, }; +static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { + .name = "caps", + .attrs = ibs_op_dtlb_pgsize_cap_attrs, + .is_visible = ibs_op_dtlb_pgsize_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_fetch_formats, &empty_caps_group, @@ -759,6 +777,7 @@ static const struct attribute_group *op_attr_update[] = { &group_zen4_ibs_extensions, &group_ibs_op_ldlat_cap, &group_ibs_op_ldlat_format, + &group_ibs_op_dtlb_pgsize_cap, NULL, }; @@ -1007,6 +1026,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, if (!op_data3->dc_lin_addr_valid) return; + if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) && + !op_data3->dc_phy_addr_valid) + return; + if (!op_data3->dc_l1tlb_miss) { data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; return; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index a60efe47498de..43b17b7b6ac07 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -503,6 +503,7 @@ struct pebs_cntr_header { #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_OPLDLAT (1U<<12) +#define IBS_CAPS_OPDTLBPGSIZE (1U<<19) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ -- GitLab From 3201bfa368fee5e70927e45222ff0b235352c01c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:43 +0000 Subject: [PATCH 259/934] perf amd ibs: Sync arch/x86/include/asm/amd-ibs.h header with the kernel Sync load latency related bit fields into the tool's header copy Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-4-ravi.bangoria@amd.com --- tools/arch/x86/include/asm/amd-ibs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 93807b437e4de..cb1740bc3da2d 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; -- GitLab From fbf10b86f6057cf79300720da4ea4b77e6708b0d Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 24 Dec 2024 12:34:56 +0900 Subject: [PATCH 260/934] firmware: imx-scu: fix OF node leak in .probe() imx_scu_probe() calls of_parse_phandle_with_args(), but does not release the OF node reference obtained by it. Add a of_node_put() call after done with the node. Fixes: f25a066d1a07 ("firmware: imx-scu: Support one TX and one RX") Signed-off-by: Joe Hattori Signed-off-by: Shawn Guo --- drivers/firmware/imx/imx-scu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/imx/imx-scu.c b/drivers/firmware/imx/imx-scu.c index 1dd4362ef9a3f..8c28e25ddc8a6 100644 --- a/drivers/firmware/imx/imx-scu.c +++ b/drivers/firmware/imx/imx-scu.c @@ -280,6 +280,7 @@ static int imx_scu_probe(struct platform_device *pdev) return ret; sc_ipc->fast_ipc = of_device_is_compatible(args.np, "fsl,imx8-mu-scu"); + of_node_put(args.np); num_channel = sc_ipc->fast_ipc ? 2 : SCU_MU_CHAN_NUM; for (i = 0; i < num_channel; i++) { -- GitLab From 38f59e0e8bd2b3e1319716e4aeaeb9a6223b006d Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Tue, 7 Jan 2025 16:03:09 +0100 Subject: [PATCH 261/934] arm64: dts: freescale: tqma8mpql: Fix vqmmc-supply eMMC is supplied by BUCK5 rail. Use the actual regulator instead of a virtual fixed regulator. Fixes: 418d1d840e421 ("arm64: dts: freescale: add initial device tree for TQMa8MPQL with i.MX8MP") Signed-off-by: Alexander Stein Signed-off-by: Shawn Guo --- .../boot/dts/freescale/imx8mp-tqma8mpql.dtsi | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi index 336785a9fba89..3ddc5aaa7c5f0 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later OR MIT /* - * Copyright 2021-2022 TQ-Systems GmbH - * Author: Alexander Stein + * Copyright 2021-2025 TQ-Systems GmbH , + * D-82229 Seefeld, Germany. + * Author: Alexander Stein */ #include "imx8mp.dtsi" @@ -23,15 +24,6 @@ regulator-max-microvolt = <3300000>; regulator-always-on; }; - - /* e-MMC IO, needed for HS modes */ - reg_vcc1v8: regulator-vcc1v8 { - compatible = "regulator-fixed"; - regulator-name = "VCC1V8"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - regulator-always-on; - }; }; &A53_0 { @@ -197,7 +189,7 @@ no-sd; no-sdio; vmmc-supply = <®_vcc3v3>; - vqmmc-supply = <®_vcc1v8>; + vqmmc-supply = <&buck5_reg>; status = "okay"; }; -- GitLab From 83964a29379cb08929a39172780a4c2992bc7c93 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Fri, 10 Jan 2025 16:18:29 +0100 Subject: [PATCH 262/934] ARM: dts: imx6qdl-apalis: Fix poweroff on Apalis iMX6 The current solution for powering off the Apalis iMX6 is not functioning as intended. To resolve this, it is necessary to power off the vgen2_reg, which will also set the POWER_ENABLE_MOCI signal to a low state. This ensures the carrier board is properly informed to initiate its power-off sequence. The new solution uses the regulator-poweroff driver, which will power off the regulator during a system shutdown. Cc: Fixes: 4eb56e26f92e ("ARM: dts: imx6q-apalis: Command pmic to standby for poweroff") Signed-off-by: Stefan Eichenberger Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi index dffab5aa8b9ca..88be29166c1ae 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi @@ -108,6 +108,11 @@ }; }; + poweroff { + compatible = "regulator-poweroff"; + cpu-supply = <&vgen2_reg>; + }; + reg_module_3v3: regulator-module-3v3 { compatible = "regulator-fixed"; regulator-always-on; @@ -236,10 +241,6 @@ status = "disabled"; }; -&clks { - fsl,pmic-stby-poweroff; -}; - /* Apalis SPI1 */ &ecspi1 { cs-gpios = <&gpio5 25 GPIO_ACTIVE_LOW>; @@ -527,7 +528,6 @@ pmic: pmic@8 { compatible = "fsl,pfuze100"; - fsl,pmic-stby-poweroff; reg = <0x08>; regulators { -- GitLab From f54af4af7bd282c0ca9eef3f3bc239ae2fd9a58a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 14:19:19 +0200 Subject: [PATCH 263/934] bitmap: Align documentation between bitmap_gather() and bitmap_scatter() The bitmap_scatter() mistakenly refers to itself for detailed explanation about the relationships of two. Instead of simply fixing this, align text in both making a cross-reference. Fixes: de5f84338970 ("lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers") Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bitmap.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2026953e2c4ed..595217b7a6e71 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -560,9 +560,9 @@ void bitmap_replace(unsigned long *dst, * ...0..11...0..10 * dst: 0000001100000010 * - * A relationship exists between bitmap_scatter() and bitmap_gather(). + * A relationship exists between bitmap_scatter() and bitmap_gather(). See + * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. - * See bitmap_scatter() for details related to this relationship. */ static __always_inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, @@ -608,7 +608,9 @@ void bitmap_scatter(unsigned long *dst, const unsigned long *src, * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See - * bitmap_scatter() for the bitmap scatter detailed operations. + * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR: + * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation. + * * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n). * The operation bitmap_gather(result, scattered, mask, n) leads to a result * equal or equivalent to src. -- GitLab From 158e9d2f3366976d74a500dbbba633036ba8501b Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Fri, 7 Feb 2025 15:14:02 -0500 Subject: [PATCH 264/934] bitmap: remove _check_eq_u32_array This has been unused since commit 3aa56885e516 ("bitmap: replace bitmap_{from,to}_u32array") in 2018. Signed-off-by: Tamir Duberstein Reviewed-by: David Gow Signed-off-by: Yury Norov --- lib/test_bitmap.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 65a75d58ed9e4..c83829ef557f5 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -100,34 +100,6 @@ __check_eq_pbl(const char *srcfile, unsigned int line, return true; } -static bool __init -__check_eq_u32_array(const char *srcfile, unsigned int line, - const u32 *exp_arr, unsigned int exp_len, - const u32 *arr, unsigned int len) __used; -static bool __init -__check_eq_u32_array(const char *srcfile, unsigned int line, - const u32 *exp_arr, unsigned int exp_len, - const u32 *arr, unsigned int len) -{ - if (exp_len != len) { - pr_warn("[%s:%u] array length differ: expected %u, got %u\n", - srcfile, line, - exp_len, len); - return false; - } - - if (memcmp(exp_arr, arr, len*sizeof(*arr))) { - pr_warn("[%s:%u] array contents differ\n", srcfile, line); - print_hex_dump(KERN_WARNING, " exp: ", DUMP_PREFIX_OFFSET, - 32, 4, exp_arr, exp_len*sizeof(*exp_arr), false); - print_hex_dump(KERN_WARNING, " got: ", DUMP_PREFIX_OFFSET, - 32, 4, arr, len*sizeof(*arr), false); - return false; - } - - return true; -} - static bool __init __check_eq_clump8(const char *srcfile, unsigned int line, const unsigned int offset, const unsigned int size, -- GitLab From 9ffa4b35a62d9786ce418085f36af671df3aacaa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 10 Feb 2025 21:51:06 -0500 Subject: [PATCH 265/934] cpumask: add for_each_{possible,online}_cpu_wrap The iterators are trivial extensions of for_each_cpu_wrap(). They are used in the following patches of the series to replace cpumask_next_wrap(). Signed-off-by: Yury Norov --- include/linux/cpumask.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 36a890d0dd575..e57fd41ca38e9 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1033,11 +1033,21 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) +#define for_each_online_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) +#define for_each_online_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ -- GitLab From d81603b32cde95e5262c84004081b2e3cb4f23ed Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 28 Jan 2025 11:46:30 -0500 Subject: [PATCH 266/934] objpool: rework objpool_pop() The function has to track number of iterations to prevent an infinite loop. for_each_cpu_wrap() macro takes care of it, which simplifies user code. Signed-off-by: Yury Norov --- include/linux/objpool.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/objpool.h b/include/linux/objpool.h index cb1758eaa2d38..b713a1fe7521b 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -170,17 +170,16 @@ static inline void *objpool_pop(struct objpool_head *pool) { void *obj = NULL; unsigned long flags; - int i, cpu; + int start, cpu; /* disable local irq to avoid preemption & interruption */ raw_local_irq_save(flags); - cpu = raw_smp_processor_id(); - for (i = 0; i < pool->nr_possible_cpus; i++) { + start = raw_smp_processor_id(); + for_each_possible_cpu_wrap(cpu, start) { obj = __objpool_try_get_slot(pool, cpu); if (obj) break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); } raw_local_irq_restore(flags); -- GitLab From 01059219b0cfdb9fc0d5bd60458e614a3135e6e7 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 18 Feb 2025 19:04:41 +0100 Subject: [PATCH 267/934] sched_ext: idle: Introduce node-aware idle cpu kfunc helpers Introduce a new kfunc to retrieve the node associated to a CPU: int scx_bpf_cpu_node(s32 cpu) Add the following kfuncs to provide BPF schedulers direct access to per-node idle cpumasks information: const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) Moreover, trigger an scx error when any of the non-node aware idle CPU kfuncs are used when SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled. Cc: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Reviewed-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 182 +++++++++++++++++++++++ tools/sched_ext/include/scx/common.bpf.h | 5 + tools/sched_ext/include/scx/compat.bpf.h | 31 ++++ 3 files changed, 218 insertions(+) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 8dacccc82ed63..759a06774b5b3 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -728,6 +728,33 @@ void scx_idle_disable(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_ops_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_ops_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + __bpf_kfunc_start_defs(); static bool check_builtin_idle_enabled(void) @@ -739,6 +766,23 @@ static bool check_builtin_idle_enabled(void) return false; } +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!ops_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + /** * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() * @p: task_struct to select a CPU for @@ -771,6 +815,28 @@ prev_cpu: return prev_cpu; } +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + /** * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking * per-CPU cpumask. @@ -795,6 +861,32 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) #endif } +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + /** * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, * per-physical-core cpumask. Can be used to determine if an entire physical @@ -859,6 +951,35 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) return false; } +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + /** * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu * @cpus_allowed: Allowed cpumask @@ -877,16 +998,64 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) * * Unavailable if ops.update_idle() is implemented and * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. */ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + if (!check_builtin_idle_enabled()) return -EBUSY; return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); } +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + /** * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU * @cpus_allowed: Allowed cpumask @@ -900,12 +1069,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not * set, this function can't tell which CPUs are idle and will always pick any * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. */ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) { s32 cpu; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + if (static_branch_likely(&scx_builtin_idle_enabled)) { cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); if (cpu >= 0) @@ -922,11 +1099,16 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 77bbe0199a32c..ca7cc4108b45f 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -71,14 +71,19 @@ u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +int scx_bpf_cpu_node(s32 cpu) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak; const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index e5fa72f9bf22b..5a9bcdb52e797 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -182,6 +182,37 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) scx_bpf_now() : \ bpf_ktime_get_ns()) +/* + * + * v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle + * cpumasks. + * + * Preserve the following __COMPAT_scx_*_node macros until v6.17. + */ +#define __COMPAT_scx_bpf_cpu_node(cpu) \ + (bpf_ksym_exists(scx_bpf_cpu_node) ? \ + scx_bpf_cpu_node(cpu) : 0) + +#define __COMPAT_scx_bpf_get_idle_cpumask_node(node) \ + (bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ? \ + scx_bpf_get_idle_cpumask_node(node) : \ + scx_bpf_get_idle_cpumask()) \ + +#define __COMPAT_scx_bpf_get_idle_smtmask_node(node) \ + (bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ? \ + scx_bpf_get_idle_smtmask_node(node) : \ + scx_bpf_get_idle_smtmask()) + +#define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) \ + (bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ? \ + scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) : \ + scx_bpf_pick_idle_cpu(cpus_allowed, flags)) + +#define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) \ + (bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ? \ + scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ + scx_bpf_pick_any_cpu(cpus_allowed, flags)) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). -- GitLab From 57937eac1f78df18146423c8a5949dbebbce84f3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Feb 2025 15:43:03 +0000 Subject: [PATCH 268/934] kernel-docs: Add book to process/kernel-docs.rst Add a reference to my new book, The Linux Memory Manager, an in-depth exploration of the memory management subsystem, to process/kernel-docs.rst. This is not yet published, but the full draft is available on pre-order, so it seems worthwhile adding it here. The situation is made clear in the 'notes' section. The 'pre-release' was made available in February 2025, and full release is scheduled for Fall 2025. The book's ISBN-13 is 978-1718504462. The document will be updated upon release to reflect this. Signed-off-by: Lorenzo Stoakes Reviewed-by: Carlos Bilbao Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250218154303.45595-1-lorenzo.stoakes@oracle.com --- Documentation/process/kernel-docs.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/process/kernel-docs.rst b/Documentation/process/kernel-docs.rst index 3b5b5983fea83..c67ac12cf789d 100644 --- a/Documentation/process/kernel-docs.rst +++ b/Documentation/process/kernel-docs.rst @@ -75,6 +75,17 @@ On-line docs Published books --------------- + * Title: **The Linux Memory Manager** + + :Author: Lorenzo Stoakes + :Publisher: No Starch Press + :Date: February 2025 + :Pages: 1300 + :ISBN: 978-1718504462 + :Notes: Memory management. Full draft available as early access for + pre-order, full release scheduled for Fall 2025. See + https://nostarch.com/linux-memory-manager for further info. + * Title: **Practical Linux System Administration: A Guide to Installation, Configuration, and Management, 1st Edition** :Author: Kenneth Hess -- GitLab From 8b2ee518fc062320416359379c2c74dcc8a4de71 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 18 Feb 2025 09:08:45 +0200 Subject: [PATCH 269/934] Documentation/kernel-parameters: fix typo in description of reserve_mem The format description of reserve_mem uses [KNG] as units, rather than [KMG]. Fix it. Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20250218070845.3769520-1-rppt@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec85..bf00552908bc6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6082,7 +6082,7 @@ is assumed to be I/O ports; otherwise it is memory. reserve_mem= [RAM] - Format: nn[KNG]::